mirror of https://github.com/apache/lucene.git
LUCENE-7626: IndexWriter no longer accepts broken offsets
parent 5b3565ed7e
commit 64b86331c2
@@ -29,6 +29,9 @@ API Changes

Bug Fixes

* LUCENE-7626: IndexWriter will no longer accept broken token offsets
  (Mike McCandless)

Improvements

* LUCENE-7489: Better storage of sparse doc-values fields with the default
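For context, "broken" offsets are token offsets that go backwards from one token to the next within a field. A minimal sketch of what IndexWriter now rejects, modeled on this commit's tests (RAMDirectory and the CannedTokenStream/Token helpers from Lucene's test framework are illustrative choices, and the exception message is paraphrased):

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class BrokenOffsetsDemo {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new RAMDirectory();
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {
      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
      ft.setStoreTermVectors(true);
      ft.setStoreTermVectorOffsets(true);
      Field field = new Field("foo", "", ft);
      // the second token starts at offset 1, before the first token's start offset 5
      field.setTokenStream(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4)));
      Document doc = new Document();
      doc.add(field);
      writer.addDocument(doc); // now throws IllegalArgumentException: offsets must not go backwards
    }
  }
}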
@@ -0,0 +1,78 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * A filter to correct offsets that illegally go backwards.
 *
 * @deprecated Fix the token filters that create broken offsets in the first place.
 */
@Deprecated
public final class FixBrokenOffsetsFilter extends TokenFilter {

  private int lastStartOffset;
  private int lastEndOffset;

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  public FixBrokenOffsetsFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken() == false) {
      return false;
    }
    fixOffsets();
    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    fixOffsets();
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lastStartOffset = 0;
    lastEndOffset = 0;
  }

  private void fixOffsets() {
    int startOffset = offsetAtt.startOffset();
    int endOffset = offsetAtt.endOffset();
    if (startOffset < lastStartOffset) {
      startOffset = lastStartOffset;
    }
    if (endOffset < startOffset) {
      endOffset = startOffset;
    }
    offsetAtt.setOffset(startOffset, endOffset);
    lastStartOffset = startOffset;
    lastEndOffset = endOffset;
  }
}
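A minimal sketch of where the filter sits in an analysis chain; the whitespace tokenizer and the idea that some legacy filter in between emits backwards offsets are assumptions made for illustration:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilter;

public class FixOffsetsAnalyzerSketch {
  public static Analyzer create() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // legacy filters that may emit backwards offsets would be chained here;
        // FixBrokenOffsetsFilter clamps such offsets so IndexWriter accepts the stream
        TokenStream sink = new FixBrokenOffsetsFilter(source);
        return new TokenStreamComponents(source, sink);
      }
    };
  }
}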
@@ -0,0 +1,39 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link FixBrokenOffsetsFilter}.
 */
public class FixBrokenOffsetsFilterFactory extends TokenFilterFactory {

  /** Sole constructor */
  public FixBrokenOffsetsFilterFactory(Map<String,String> args) {
    super(args);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new FixBrokenOffsetsFilter(input);
  }
}
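Because the factory is registered through SPI (see the services file in the next hunk), it can also be looked up by name; the "fixBrokenOffsets" lookup key below is an assumption based on Lucene's usual factory naming convention:

import java.util.HashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class FactoryLookupSketch {
  static TokenStream wrap(TokenStream in) {
    // resolve the factory by its SPI name and use it to wrap an existing stream
    TokenFilterFactory factory = TokenFilterFactory.forName("fixBrokenOffsets", new HashMap<>());
    return factory.create(in);
  }
}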
@@ -64,6 +64,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
@@ -0,0 +1,50 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;

public class TestFixBrokenOffsetsFilter extends BaseTokenStreamTestCase {

  public void testBogusTermVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
        new Token("bar", 5, 10), new Token("bar", 1, 4)
    )));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
  }
}
@@ -0,0 +1,125 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.SuppressForbidden;

/**
 * Command-line tool that reads from a source index and
 * writes to a dest index, correcting any broken offsets
 * in the process.
 *
 * @lucene.experimental
 */
public class FixBrokenOffsets {
  public SegmentInfos infos;

  FSDirectory fsDir;

  Path dir;

  @SuppressForbidden(reason = "System.out required: command line tool")
  public static void main(String[] args) throws IOException {
    if (args.length < 2) {
      System.err.println("Usage: FixBrokenOffsets <srcDir> <destDir>");
      return;
    }
    Path srcPath = Paths.get(args[0]);
    if (!Files.exists(srcPath)) {
      throw new RuntimeException("srcPath " + srcPath.toAbsolutePath() + " doesn't exist");
    }
    Path destPath = Paths.get(args[1]);
    if (Files.exists(destPath)) {
      throw new RuntimeException("destPath " + destPath.toAbsolutePath() + " already exists; please remove it and re-run");
    }
    Directory srcDir = FSDirectory.open(srcPath);
    DirectoryReader reader = DirectoryReader.open(srcDir);

    List<LeafReaderContext> leaves = reader.leaves();
    CodecReader[] filtered = new CodecReader[leaves.size()];
    for(int i=0;i<leaves.size();i++) {
      filtered[i] = SlowCodecReaderWrapper.wrap(new FilterLeafReader(leaves.get(i).reader()) {
        @Override
        public Fields getTermVectors(int docID) throws IOException {
          Fields termVectors = in.getTermVectors(docID);
          if (termVectors == null) {
            return null;
          }
          return new FilterFields(termVectors) {
            @Override
            public Terms terms(String field) throws IOException {
              return new FilterTerms(super.terms(field)) {
                @Override
                public TermsEnum iterator() throws IOException {
                  return new FilterTermsEnum(super.iterator()) {
                    @Override
                    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
                      return new FilterPostingsEnum(super.postings(reuse, flags)) {
                        int nextLastStartOffset = 0;
                        int lastStartOffset = 0;

                        @Override
                        public int nextPosition() throws IOException {
                          int pos = super.nextPosition();
                          lastStartOffset = nextLastStartOffset;
                          nextLastStartOffset = startOffset();
                          return pos;
                        }

                        @Override
                        public int startOffset() throws IOException {
                          int offset = super.startOffset();
                          if (offset < lastStartOffset) {
                            offset = lastStartOffset;
                          }
                          return offset;
                        }

                        @Override
                        public int endOffset() throws IOException {
                          int offset = super.endOffset();
                          if (offset < lastStartOffset) {
                            offset = lastStartOffset;
                          }
                          return offset;
                        }
                      };
                    }
                  };
                }
              };
            }
          };
        }
      });
    }

    Directory destDir = FSDirectory.open(destPath);
    IndexWriter writer = new IndexWriter(destDir, new IndexWriterConfig());
    writer.addIndexes(filtered);
    IOUtils.close(writer, reader, srcDir, destDir);
  }
}
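Running the tool is just invoking its main method with a source and a destination index path; the paths below are placeholders, and the destination must not exist yet (the tool refuses to overwrite it):

// e.g. from code, or equivalently from the command line with the
// backward-codecs and core jars on the classpath
FixBrokenOffsets.main(new String[] { "/path/to/brokenIndex", "/path/to/fixedIndex" });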
@@ -0,0 +1,27 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
  <title>Tools for handling backwards compatibility issues with indices.</title>
</head>
<body>
Tools for handling backwards compatibility issues with indices.
</body>
</html>
@@ -0,0 +1,114 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.List;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;

public class TestFixBrokenOffsets extends LuceneTestCase {

  // Run this in Lucene 6.x:
  //
  //   ant test -Dtestcase=TestFixBrokenOffsets -Dtestmethod=testCreateBrokenOffsetsIndex -Dtests.codec=default -Dtests.useSecurityManager=false
  /*
  public void testCreateBrokenOffsetsIndex() throws IOException {

    Path indexDir = Paths.get("/tmp/brokenoffsets");
    Files.deleteIfExists(indexDir);
    Directory dir = newFSDirectory(indexDir);
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());

    Document doc = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "bar", fieldType);
    field.setTokenStream(new CannedTokenStream(new Token("foo", 10, 13), new Token("foo", 7, 9)));
    doc.add(field);
    writer.addDocument(doc);
    writer.commit();

    // 2nd segment
    doc = new Document();
    field = new Field("foo", "bar", fieldType);
    field.setTokenStream(new CannedTokenStream(new Token("bar", 15, 17), new Token("bar", 1, 5)));
    doc.add(field);
    writer.addDocument(doc);

    writer.close();

    dir.close();
  }
  */

  public void testFixBrokenOffsetsIndex() throws IOException {
    InputStream resource = getClass().getResourceAsStream("index.630.brokenoffsets.zip");
    assertNotNull("Broken offsets index not found", resource);
    Path path = createTempDir("brokenoffsets");
    TestUtil.unzip(resource, path);
    Directory dir = FSDirectory.open(path);

    // OK: index is 6.3.0 so offsets not checked:
    TestUtil.checkIndex(dir);

    MockDirectoryWrapper tmpDir = newMockDirectory();
    tmpDir.setCheckIndexOnClose(false);
    IndexWriter w = new IndexWriter(tmpDir, new IndexWriterConfig());
    w.addIndexes(dir);
    w.close();
    // OK: addIndexes(Directory...) also keeps version as 6.3.0, so offsets not checked:
    TestUtil.checkIndex(tmpDir);
    tmpDir.close();

    final MockDirectoryWrapper tmpDir2 = newMockDirectory();
    tmpDir2.setCheckIndexOnClose(false);
    w = new IndexWriter(tmpDir2, new IndexWriterConfig());
    DirectoryReader reader = DirectoryReader.open(dir);
    List<LeafReaderContext> leaves = reader.leaves();
    CodecReader[] codecReaders = new CodecReader[leaves.size()];
    for(int i=0;i<leaves.size();i++) {
      codecReaders[i] = (CodecReader) leaves.get(i).reader();
    }
    w.addIndexes(codecReaders);
    w.close();

    // NOT OK: broken offsets were copied into a 7.0 segment:
    ByteArrayOutputStream output = new ByteArrayOutputStream(1024);
    RuntimeException re = expectThrows(RuntimeException.class, () -> {TestUtil.checkIndex(tmpDir2, false, true, output);});
    assertEquals("term [66 6f 6f]: doc 0: pos 1: startOffset 7 < lastStartOffset 10; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index", re.getMessage());
    tmpDir2.close();

    // Now run the tool and confirm the broken offsets are fixed:
    Path path2 = createTempDir("fixedbrokenoffsets").resolve("subdir");
    FixBrokenOffsets.main(new String[] {path.toString(), path2.toString()});
    Directory tmpDir3 = FSDirectory.open(path2);
    TestUtil.checkIndex(tmpDir3);
    tmpDir3.close();

    dir.close();
  }
}

Binary file not shown.
@@ -740,13 +740,13 @@ public final class CheckIndex implements Closeable {
        segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);

        // Test the Term Index
        segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast);
        segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast, version);

        // Test Stored Fields
        segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);

        // Test Term Vectors
        segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast);
        segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast, version);

        // Test Docvalues
        segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@@ -1205,7 +1205,7 @@ public final class CheckIndex implements Closeable {
   * checks Fields api is consistent with itself.
   * searcher is optional, to verify with queries. Can be null.
   */
  private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose) throws IOException {
  private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, Version version) throws IOException {
    // TODO: we should probably return our own stats thing...?!
    long startNS;
    if (doPrint) {
@@ -1461,14 +1461,13 @@ public final class CheckIndex implements Closeable {
              if (hasOffsets) {
                int startOffset = postings.startOffset();
                int endOffset = postings.endOffset();
                // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
                // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
                if (!isVectors) {
                // In Lucene 7 we fixed IndexWriter to also enforce term vector offsets
                if (isVectors == false || version.onOrAfter(Version.LUCENE_7_0_0)) {
                  if (startOffset < 0) {
                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                  }
                  if (startOffset < lastOffset) {
                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset + "; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index");
                  }
                  if (endOffset < 0) {
                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
@@ -1742,15 +1741,15 @@ public final class CheckIndex implements Closeable {
   * Test the term index.
   * @lucene.experimental
   */
  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) throws IOException {
    return testPostings(reader, infoStream, false, false);
  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
    return testPostings(reader, infoStream, false, false, version);
  }

  /**
   * Test the term index.
   * @lucene.experimental
   */
  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast, Version version) throws IOException {

    // TODO: we should go and verify term vectors match, if
    // crossCheckTermVectors is on...
@@ -1765,7 +1764,7 @@ public final class CheckIndex implements Closeable {

      final Fields fields = reader.getPostingsReader().getMergeInstance();
      final FieldInfos fieldInfos = reader.getFieldInfos();
      status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose);
      status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
    } catch (Throwable e) {
      if (failFast) {
        IOUtils.reThrow(e);
@@ -2339,15 +2338,15 @@ public final class CheckIndex implements Closeable {
   * Test term vectors.
   * @lucene.experimental
   */
  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) throws IOException {
    return testTermVectors(reader, infoStream, false, false, false);
  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
    return testTermVectors(reader, infoStream, false, false, false, version);
  }

  /**
   * Test term vectors.
   * @lucene.experimental
   */
  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
    long startNS = System.nanoTime();
    final Status.TermVectorStatus status = new Status.TermVectorStatus();
    final FieldInfos fieldInfos = reader.getFieldInfos();
@@ -2387,7 +2386,7 @@ public final class CheckIndex implements Closeable {

        if (tfv != null) {
          // First run with no deletions:
          checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);
          checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);

          // Only agg stats if the doc is live:
          final boolean doStats = liveDocs == null || liveDocs.get(j);
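The extra Version argument threaded through these checks means the backwards-offset validation of term vectors is only enforced for segments written by Lucene 7.0 or later, while postings offsets are checked as before. A minimal sketch of running the checker against an existing index (path and output handling are illustrative):

import java.nio.file.Paths;

import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CheckIndexSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         CheckIndex checker = new CheckIndex(dir)) {
      CheckIndex.Status status = checker.checkIndex();
      if (status.clean == false) {
        // a 7.0 segment with backwards offsets is reported here, with a pointer to FixBrokenOffsets
        System.out.println("index has problems; see CheckIndex output");
      }
    }
  }
}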
@@ -27,6 +27,7 @@ import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsConsumer;
@@ -728,10 +729,6 @@ final class DefaultIndexingChain extends DocConsumer {

    final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;

    // only bother checking offsets if something will consume them.
    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
    final boolean checkOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

    /*
     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
@@ -743,6 +740,7 @@ final class DefaultIndexingChain extends DocConsumer {
      stream.reset();
      invertState.setAttributeSource(stream);
      termsHashPerField.start(field, first);
      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);

      while (stream.incrementToken()) {

@@ -771,15 +769,13 @@ final class DefaultIndexingChain extends DocConsumer {
          invertState.numOverlap++;
        }

        if (checkOffsets) {
          int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
          int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
          if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
            throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
                + "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
          }
          invertState.lastStartOffset = startOffset;
        int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
        int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
        if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
          throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
              + "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
        }
        invertState.lastStartOffset = startOffset;

        invertState.length++;
        if (invertState.length < 0) {
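With the checkOffsets flag removed, this validation now runs for every tokenized field, not just fields indexed with DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS. A sketch of a case that used to slip through and is now rejected (field name and token offsets are made up; CannedTokenStream/Token come from Lucene's test framework):

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

public class PositionsOnlyBrokenOffsets {
  static Document buildDoc() {
    // positions only: the postings never store offsets, yet backwards offsets
    // in the token stream are still rejected at index time after this change
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    Field field = new Field("body", "", ft);
    field.setTokenStream(new CannedTokenStream(new Token("a", 5, 8), new Token("b", 2, 3)));
    Document doc = new Document();
    doc.add(field);
    return doc; // IndexWriter.addDocument(doc) now throws IllegalArgumentException
  }
}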
@@ -42,11 +42,6 @@ public class TestCheckIndex extends BaseTestCheckIndex {
    testDeletedDocs(directory);
  }

  @Test
  public void testBogusTermVectors() throws IOException {
    testBogusTermVectors(directory);
  }

  @Test
  public void testChecksumsOnly() throws IOException {
    testChecksumsOnly(directory);
@@ -377,7 +377,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
    }

    final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
        new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
        new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
    //check to see if the token streams might have non-deterministic testable result
    final boolean storeTermVectorPositions = random().nextBoolean();
    final int[] startOffsets = rTokenStream.getStartOffsets();
@@ -45,6 +45,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
@@ -431,7 +432,9 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
    @Override
    public boolean incrementToken() throws IOException {
      if (synNext) {
        AttributeSource.State state = captureState();
        clearAttributes();
        restoreState(state);
        posIncAtt.setPositionIncrement(0);
        termAtt.append(""+((char) 97 + random().nextInt(3)));
        synNext = false;
@@ -200,10 +200,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
    int i = 0;

    public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
      this(len, sampleTerms, sampleTermBytes, rarely());
    }

    public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
      terms = new String[len];
      termBytes = new BytesRef[len];
      positionsIncrements = new int[len];
@@ -216,17 +212,12 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
        terms[i] = sampleTerms[o];
        termBytes[i] = sampleTermBytes[o];
        positionsIncrements[i] = TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
        if (offsetsGoBackwards) {
          startOffsets[i] = random().nextInt();
          endOffsets[i] = random().nextInt();
        if (i == 0) {
          startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
        } else {
          if (i == 0) {
            startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
          } else {
            startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
          }
          endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
          startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
        }
        endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
      }

      for (int i = 0; i < len; ++i) {
@@ -22,11 +22,8 @@ import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.Directory;
@@ -105,22 +102,6 @@ public class BaseTestCheckIndex extends LuceneTestCase {
    checker.close();
  }

  // LUCENE-4221: we have to let these thru, for now
  public void testBogusTermVectors(Directory dir) throws IOException {
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    field.setTokenStream(new CannedTokenStream(
        new Token("bar", 5, 10), new Token("bar", 1, 4)
    ));
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
  }

  public void testChecksumsOnly(Directory dir) throws IOException {
    LineFileDocs lf = new LineFileDocs(random());
    MockAnalyzer analyzer = new MockAnalyzer(random());
@@ -334,9 +334,9 @@ public final class TestUtil {
      CheckIndex.testLiveDocs(codecReader, infoStream, true);
      CheckIndex.testFieldInfos(codecReader, infoStream, true);
      CheckIndex.testFieldNorms(codecReader, infoStream, true);
      CheckIndex.testPostings(codecReader, infoStream, false, true);
      CheckIndex.testPostings(codecReader, infoStream, false, true, Version.LUCENE_7_0_0);
      CheckIndex.testStoredFields(codecReader, infoStream, true);
      CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true);
      CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true, Version.LUCENE_7_0_0);
      CheckIndex.testDocValues(codecReader, infoStream, true);
      CheckIndex.testPoints(codecReader, infoStream, true);

@@ -27,6 +27,7 @@ import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
@@ -284,6 +285,7 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
    private byte[] binaryValue = null;
    private PreAnalyzedParser parser;
    private IOException readerConsumptionException;
    private int lastEndOffset;

    public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
      // we don't pack attributes: since we are used for (de)serialization and dont want bloat.
@@ -311,6 +313,8 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly

      AttributeSource.State state = it.next();
      restoreState(state.clone());
      // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
      lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
      return true;
    }

@@ -329,6 +333,13 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
      it = cachedStates.iterator();
    }

    @Override
    public void end() throws IOException {
      super.end();
      // we must set the end offset correctly so multi-valued fields don't try to send offsets backwards:
      addAttribute(OffsetAttribute.class).setOffset(lastEndOffset, lastEndOffset);
    }

    private void setReaderConsumptionException(IOException e) {
      readerConsumptionException = e;
    }
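The end() override matters because IndexWriter uses the offset reported by end(), plus the analyzer's offset gap, as the base offset for the next value of a multi-valued field; reporting 0 there could make the next value's offsets appear to go backwards and trip the new IndexWriter check. The same pattern applies to any stream that replays cached tokens; a generic sketch of the idea, not the Solr class itself:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/** Illustrative filter that remembers the last end offset and reports it from end(). */
final class TrackFinalOffsetFilter extends TokenFilter {
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private int finalEndOffset;

  TrackFinalOffsetFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken() == false) {
      return false;
    }
    finalEndOffset = offsetAtt.endOffset();
    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    // report the true final offset so multi-valued fields don't send offsets backwards
    offsetAtt.setOffset(finalEndOffset, finalEndOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    finalEndOffset = 0;
  }
}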
@@ -120,11 +120,6 @@ public class CheckHdfsIndexTest extends AbstractFullDistribZkTestBase {
    testCheckIndex.testDeletedDocs(directory);
  }

  @Test
  public void testBogusTermVectors() throws IOException {
    testCheckIndex.testBogusTermVectors(directory);
  }

  @Test
  public void testChecksumsOnly() throws IOException {
    testCheckIndex.testChecksumsOnly(directory);