LUCENE-6212: remove per-doc analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1656272 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2015-02-01 09:12:09 +00:00
parent b5db48c783
commit 1529c57ca1
25 changed files with 574 additions and 647 deletions

View File

@ -392,6 +392,11 @@ API Changes
Weight.scoresDocsOutOfOrder and LeafCollector.acceptsDocsOutOfOrder have been
removed and boolean queries now always score in order.
* LUCENE-6212: IndexWriter no longer accepts per-document Analyzer to
add/updateDocument. These methods were trappy as they made it
easy to accidentally index tokens that were not easily
searchable. (Mike McCandless)
Bug Fixes
* LUCENE-5650: Enforce read-only access to any path outside the temporary

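A minimal migration sketch for the LUCENE-6212 entry above (illustrative only, not part of the commit; StandardAnalyzer, RAMDirectory and the field name are arbitrary choices): the analyzer is now supplied once through IndexWriterConfig instead of per addDocument/updateDocument call. If different fields need different analysis, a wrapper such as PerFieldAnalyzerWrapper, or the DelegatingAnalyzerWrapper used by the Solr changes later in this commit, can be configured once on the writer.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class AddDocumentMigrationSketch {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();   // arbitrary analyzer for the sketch
    try (Directory dir = new RAMDirectory();      // in-memory directory, illustration only
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
      Document doc = new Document();
      doc.add(new TextField("contents", "there is some content written here", Field.Store.YES));
      // Before LUCENE-6212 a per-document analyzer could be passed:
      //   writer.addDocument(doc, analyzer);
      // After this change the analyzer always comes from the IndexWriterConfig:
      writer.addDocument(doc);
    }
  }
}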
View File

@ -75,7 +75,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
doc.add(new TextField("title", dummyTitle, Field.Store.YES));
String dummyContent = "there is some content written here";
doc.add(new TextField("contents", dummyContent, Field.Store.YES));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
writer.commit();
// try the search over the first doc
@ -96,7 +96,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
doc.add(new TextField("title", dogmasTitle, Field.Store.YES));
String dogmasContents = "white men can't jump";
doc.add(new TextField("contents", dogmasContents, Field.Store.YES));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
writer.commit();
directoryReader.close();

View File

@ -16,11 +16,15 @@
*/
package org.apache.lucene.classification;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
@ -32,9 +36,6 @@ import org.apache.lucene.util.TestUtil;
import org.junit.After;
import org.junit.Before;
import java.io.IOException;
import java.util.Random;
/**
* Base class for testing {@link Classifier}s
*/
@ -113,7 +114,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
assertEquals("got an assigned class of " + classificationResult.getAssignedClass(), expectedResult, classificationResult.getAssignedClass());
double score = classificationResult.getScore();
assertTrue("score should be between 0 and 1, got: " + score, score <= 1 && score >= 0);
updateSampleIndex(analyzer);
updateSampleIndex();
ClassificationResult<T> secondClassificationResult = classifier.assignClass(inputDoc);
assertEquals(classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass());
assertEquals(Double.valueOf(score), Double.valueOf(secondClassificationResult.getScore()));
@ -125,7 +126,8 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
}
private void populateSampleIndex(Analyzer analyzer) throws IOException {
indexWriter.deleteAll();
indexWriter.close();
indexWriter = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
indexWriter.commit();
String text;
@ -138,7 +140,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
@ -146,7 +148,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "And there's a threshold question that he has to answer for the American people and " +
@ -155,7 +157,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
@ -164,7 +166,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
@ -173,7 +175,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "So, about all those experts and analysts who've spent the past year or so saying " +
@ -181,7 +183,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "More than 400 million people trust Google with their e-mail, and 50 million store files" +
@ -190,12 +192,12 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "unlabeled doc";
doc.add(new Field(textFieldName, text, ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
indexWriter.commit();
}
@ -217,7 +219,8 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
}
private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
indexWriter.deleteAll();
indexWriter.close();
indexWriter = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
indexWriter.commit();
FieldType ft = new FieldType(TextField.TYPE_STORED);
@ -232,7 +235,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(textFieldName, createRandomString(random), ft));
doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
}
indexWriter.commit();
}
@ -246,7 +249,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
return builder.toString();
}
private void updateSampleIndex(Analyzer analyzer) throws Exception {
private void updateSampleIndex() throws Exception {
String text;
@ -256,54 +259,54 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Julian Zelizer says Bill Clinton is still trying to shape his party, years after the White House, while George W. Bush opts for a much more passive role.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Crossfire: Sen. Tim Scott passes on Sen. Lindsey Graham endorsement";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Illinois becomes 16th state to allow same-sex marriage.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Apple is developing iPhones with curved-glass screens and enhanced sensors that detect different levels of pressure, according to a new report.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "The Xbox One is Microsoft's first new gaming console in eight years. It's a quality piece of hardware but it's also noteworthy because Microsoft is using it to make a statement.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "Google says it will replace a Google Maps image after a California father complained it shows the body of his teen-age son, who was shot to death in 2009.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
doc = new Document();
text = "second unlabeled doc";
doc.add(new Field(textFieldName, text, ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
indexWriter.commit();
}

View File

@ -64,8 +64,6 @@ public class DataSplitterTest extends LuceneTestCase {
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Analyzer analyzer = new MockAnalyzer(random());
Document doc;
Random rnd = random();
for (int i = 0; i < 100; i++) {
@ -73,7 +71,7 @@ public class DataSplitterTest extends LuceneTestCase {
doc.add(new Field(idFieldName, Integer.toString(i), ft));
doc.add(new Field(textFieldName, TestUtil.randomUnicodeString(rnd, 1024), ft));
doc.add(new Field(classFieldName, TestUtil.randomUnicodeString(rnd, 10), ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
}
indexWriter.commit();

View File

@ -55,14 +55,12 @@ public class DocToDoubleVectorUtilsTest extends LuceneTestCase {
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Analyzer analyzer = new MockAnalyzer(random());
Document doc;
for (int i = 0; i < 10; i++) {
doc = new Document();
doc.add(new Field("id", Integer.toString(i), ft));
doc.add(new Field("text", random().nextInt(10) + " " + random().nextInt(10) + " " + random().nextInt(10), ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.addDocument(doc);
}
indexWriter.commit();

View File

@ -1127,22 +1127,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
* @throws IOException if there is a low-level IO error
*/
public void addDocument(IndexDocument doc) throws IOException {
addDocument(doc, analyzer);
}
/**
* Adds a document to this index, using the provided analyzer instead of the
* value of {@link #getAnalyzer()}.
*
* <p>See {@link #addDocument(IndexDocument)} for details on
* index and IndexWriter state after an Exception, and
* flushing/merging temporary free space requirements.</p>
*
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
public void addDocument(IndexDocument doc, Analyzer analyzer) throws IOException {
updateDocument(null, doc, analyzer);
updateDocument(null, doc);
}
/**
@ -1183,22 +1168,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
* @lucene.experimental
*/
public void addDocuments(Iterable<? extends IndexDocument> docs) throws IOException {
addDocuments(docs, analyzer);
}
/**
* Atomically adds a block of documents, analyzed using the
* provided analyzer, with sequentially assigned document
* IDs, such that an external reader will see all or none
* of the documents.
*
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*
* @lucene.experimental
*/
public void addDocuments(Iterable<? extends IndexDocument> docs, Analyzer analyzer) throws IOException {
updateDocuments(null, docs, analyzer);
updateDocuments(null, docs);
}
/**
@ -1215,24 +1185,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
* @lucene.experimental
*/
public void updateDocuments(Term delTerm, Iterable<? extends IndexDocument> docs) throws IOException {
updateDocuments(delTerm, docs, analyzer);
}
/**
* Atomically deletes documents matching the provided
* delTerm and adds a block of documents, analyzed using
* the provided analyzer, with sequentially
* assigned document IDs, such that an external reader
* will see all or none of the documents.
*
* See {@link #addDocuments(Iterable)}.
*
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*
* @lucene.experimental
*/
public void updateDocuments(Term delTerm, Iterable<? extends IndexDocument> docs, Analyzer analyzer) throws IOException {
ensureOpen();
try {
boolean success = false;
@ -1384,26 +1336,6 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
*/
public void updateDocument(Term term, IndexDocument doc) throws IOException {
ensureOpen();
updateDocument(term, doc, analyzer);
}
/**
* Updates a document by first deleting the document(s)
* containing <code>term</code> and then adding the new
* document. The delete and then add are atomic as seen
* by a reader on the same index (flush may happen only after
* the add).
*
* @param term the term to identify the document(s) to be
* deleted
* @param doc the document to be added
* @param analyzer the analyzer to use when analyzing the document
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
public void updateDocument(Term term, IndexDocument doc, Analyzer analyzer)
throws IOException {
ensureOpen();
try {
boolean success = false;
try {

View File

@ -47,15 +47,6 @@ public class TrackingIndexWriter {
this.writer = writer;
}
/** Calls {@link
* IndexWriter#updateDocument(Term,IndexDocument,Analyzer)}
* and returns the generation that reflects this change. */
public long updateDocument(Term t, IndexDocument d, Analyzer a) throws IOException {
writer.updateDocument(t, d, a);
// Return gen as of when indexing finished:
return indexingGen.get();
}
/** Calls {@link
* IndexWriter#updateDocument(Term,IndexDocument)} and
* returns the generation that reflects this change. */
@ -65,15 +56,6 @@ public class TrackingIndexWriter {
return indexingGen.get();
}
/** Calls {@link
* IndexWriter#updateDocuments(Term,Iterable,Analyzer)}
* and returns the generation that reflects this change. */
public long updateDocuments(Term t, Iterable<? extends IndexDocument> docs, Analyzer a) throws IOException {
writer.updateDocuments(t, docs, a);
// Return gen as of when indexing finished:
return indexingGen.get();
}
/** Calls {@link
* IndexWriter#updateDocuments(Term,Iterable)} and returns
* the generation that reflects this change. */
@ -123,24 +105,6 @@ public class TrackingIndexWriter {
return indexingGen.get();
}
/** Calls {@link
* IndexWriter#addDocument(IndexDocument,Analyzer)} and
* returns the generation that reflects this change. */
public long addDocument(IndexDocument d, Analyzer a) throws IOException {
writer.addDocument(d, a);
// Return gen as of when indexing finished:
return indexingGen.get();
}
/** Calls {@link
* IndexWriter#addDocuments(Iterable,Analyzer)} and
* returns the generation that reflects this change. */
public long addDocuments(Iterable<? extends IndexDocument> docs, Analyzer a) throws IOException {
writer.addDocuments(docs, a);
// Return gen as of when indexing finished:
return indexingGen.get();
}
/** Calls {@link IndexWriter#addDocument(IndexDocument)}
* and returns the generation that reflects this change. */
public long addDocument(IndexDocument d) throws IOException {

View File

@ -305,7 +305,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
}
};
final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory());
final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
final Document doc = new Document();
final FieldType ft = new FieldType();
ft.setIndexOptions(IndexOptions.DOCS);
@ -315,7 +315,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
ft.setStoreTermVectorOffsets(true);
doc.add(new Field("f", "a", ft));
doc.add(new Field("f", "a", ft));
writer.addDocument(doc, a);
writer.addDocument(doc);
final LeafReader reader = getOnlySegmentReader(writer.getReader());
final Fields fields = reader.getTermVectors(0);
final Terms terms = fields.terms("f");

View File

@ -392,27 +392,35 @@ public class TestIndexWriterExceptions extends LuceneTestCase {
// LUCENE-1208
public void testExceptionJustBeforeFlush() throws IOException {
Directory dir = newDirectory();
IndexWriter w = RandomIndexWriter.mockIndexWriter(dir,
newIndexWriterConfig(new MockAnalyzer(random()))
.setMaxBufferedDocs(2),
new TestPoint1());
Document doc = new Document();
doc.add(newTextField("field", "a field", Field.Store.YES));
w.addDocument(doc);
final AtomicBoolean doCrash = new AtomicBoolean();
Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
@Override
public TokenStreamComponents createComponents(String fieldName) {
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setEnableChecks(false); // disable workflow checking as we forcefully close() in exceptional cases.
return new TokenStreamComponents(tokenizer, new CrashingFilter(fieldName, tokenizer));
TokenStream stream = tokenizer;
if (doCrash.get()) {
stream = new CrashingFilter(fieldName, stream);
}
return new TokenStreamComponents(tokenizer, stream);
}
};
IndexWriter w = RandomIndexWriter.mockIndexWriter(dir,
newIndexWriterConfig(analyzer)
.setMaxBufferedDocs(2),
new TestPoint1());
Document doc = new Document();
doc.add(newTextField("field", "a field", Field.Store.YES));
w.addDocument(doc);
Document crashDoc = new Document();
crashDoc.add(newTextField("crash", "do it on token 4", Field.Store.YES));
doCrash.set(true);
try {
w.addDocument(crashDoc, analyzer);
w.addDocument(crashDoc);
fail("did not hit expected exception");
} catch (IOException ioe) {
// expected

View File

@ -42,521 +42,520 @@ import org.apache.lucene.util.TestUtil;
public class TestPayloads extends LuceneTestCase {
// Simple tests to test the Payload class
public void testPayload() throws Exception {
BytesRef payload = new BytesRef("This is a test!");
assertEquals("Wrong payload length.", "This is a test!".length(), payload.length);
// Simple tests to test the Payload class
public void testPayload() throws Exception {
BytesRef payload = new BytesRef("This is a test!");
assertEquals("Wrong payload length.", "This is a test!".length(), payload.length);
BytesRef clone = payload.clone();
assertEquals(payload.length, clone.length);
for (int i = 0; i < payload.length; i++) {
assertEquals(payload.bytes[i + payload.offset], clone.bytes[i + clone.offset]);
}
BytesRef clone = payload.clone();
assertEquals(payload.length, clone.length);
for (int i = 0; i < payload.length; i++) {
assertEquals(payload.bytes[i + payload.offset], clone.bytes[i + clone.offset]);
}
}
}
// Tests whether the DocumentWriter and SegmentMerger correctly enable the
// payload bit in the FieldInfo
public void testPayloadFieldBit() throws Exception {
Directory ram = newDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(analyzer));
Document d = new Document();
// this field won't have any payloads
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
// this field will have payloads in all docs, however not for all term positions,
// so this field is used to check if the DocumentWriter correctly enables the payloads bit
// even if only some term positions have payloads
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs NO PAYLOAD", Field.Store.NO));
// this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
// enabled in only some documents
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// only add payload data for field f2
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
writer.addDocument(d);
// flush
writer.close();
// Tests whether the DocumentWriter and SegmentMerger correctly enable the
// payload bit in the FieldInfo
public void testPayloadFieldBit() throws Exception {
Directory ram = newDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(analyzer));
Document d = new Document();
// this field won't have any payloads
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
// this field will have payloads in all docs, however not for all term positions,
// so this field is used to check if the DocumentWriter correctly enables the payloads bit
// even if only some term positions have payloads
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs NO PAYLOAD", Field.Store.NO));
// this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
// enabled in only some documents
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// only add payload data for field f2
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
writer.addDocument(d);
// flush
writer.close();
SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
FieldInfos fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
SegmentReader reader = getOnlySegmentReader(DirectoryReader.open(ram));
FieldInfos fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
// now we add another document which has payloads for field f3 and verify if the SegmentMerger
// enabled payloads for that field
analyzer = new PayloadAnalyzer(); // Clear payload state for each field
writer = new IndexWriter(ram, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
d = new Document();
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// add payload data for field f2 and f3
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
analyzer.setPayloadData("f3", "somedata".getBytes(StandardCharsets.UTF_8), 0, 3);
writer.addDocument(d);
// now we add another document which has payloads for field f3 and verify if the SegmentMerger
// enabled payloads for that field
analyzer = new PayloadAnalyzer(); // Clear payload state for each field
writer = new IndexWriter(ram, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
d = new Document();
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// add payload data for field f2 and f3
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
analyzer.setPayloadData("f3", "somedata".getBytes(StandardCharsets.UTF_8), 0, 3);
writer.addDocument(d);
// force merge
writer.forceMerge(1);
// flush
writer.close();
// force merge
writer.forceMerge(1);
// flush
writer.close();
reader = getOnlySegmentReader(DirectoryReader.open(ram));
fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
ram.close();
}
reader = getOnlySegmentReader(DirectoryReader.open(ram));
fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
ram.close();
}
// Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
public void testPayloadsEncoding() throws Exception {
Directory dir = newDirectory();
performTest(dir);
dir.close();
}
// Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
public void testPayloadsEncoding() throws Exception {
Directory dir = newDirectory();
performTest(dir);
dir.close();
}
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void performTest(Directory dir) throws Exception {
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy()));
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void performTest(Directory dir) throws Exception {
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy()));
// should be in sync with value in TermInfosWriter
final int skipInterval = 16;
// should be in sync with value in TermInfosWriter
final int skipInterval = 16;
final int numTerms = 5;
final String fieldName = "f1";
final int numTerms = 5;
final String fieldName = "f1";
int numDocs = skipInterval + 1;
// create content for the test documents with just a few terms
Term[] terms = generateTerms(fieldName, numTerms);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < terms.length; i++) {
sb.append(terms[i].text());
sb.append(" ");
}
String content = sb.toString();
int numDocs = skipInterval + 1;
// create content for the test documents with just a few terms
Term[] terms = generateTerms(fieldName, numTerms);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < terms.length; i++) {
sb.append(terms[i].text());
sb.append(" ");
}
String content = sb.toString();
int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
byte[] payloadData = generateRandomData(payloadDataLength);
int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
byte[] payloadData = generateRandomData(payloadDataLength);
Document d = new Document();
d.add(newTextField(fieldName, content, Field.Store.NO));
// add the same document multiple times to have the same payload lengths for all
// occurrences within two consecutive skip intervals
int offset = 0;
for (int i = 0; i < 2 * numDocs; i++) {
analyzer = new PayloadAnalyzer(fieldName, payloadData, offset, 1);
offset += numTerms;
writer.addDocument(d, analyzer);
}
Document d = new Document();
d.add(newTextField(fieldName, content, Field.Store.NO));
// add the same document multiple times to have the same payload lengths for all
// occurrences within two consecutive skip intervals
int offset = 0;
for (int i = 0; i < 2 * numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, 1);
offset += numTerms;
writer.addDocument(d);
}
// make sure we create more than one segment to test merging
writer.commit();
// make sure we create more than one segment to test merging
writer.commit();
// now we make sure to have different payload lengths at the next skip point
for (int i = 0; i < numDocs; i++) {
analyzer = new PayloadAnalyzer(fieldName, payloadData, offset, i);
offset += i * numTerms;
writer.addDocument(d, analyzer);
}
// now we make sure to have different payload lengths at the next skip point
for (int i = 0; i < numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, i);
offset += i * numTerms;
writer.addDocument(d);
}
writer.forceMerge(1);
// flush
writer.close();
writer.forceMerge(1);
// flush
writer.close();
/*
* Verify the index
* first we test if all payloads are stored correctly
*/
IndexReader reader = DirectoryReader.open(dir);
/*
* Verify the index
* first we test if all payloads are stored correctly
*/
IndexReader reader = DirectoryReader.open(dir);
byte[] verifyPayloadData = new byte[payloadDataLength];
offset = 0;
DocsAndPositionsEnum[] tps = new DocsAndPositionsEnum[numTerms];
for (int i = 0; i < numTerms; i++) {
tps[i] = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[i].field(),
new BytesRef(terms[i].text()));
}
byte[] verifyPayloadData = new byte[payloadDataLength];
offset = 0;
DocsAndPositionsEnum[] tps = new DocsAndPositionsEnum[numTerms];
for (int i = 0; i < numTerms; i++) {
tps[i] = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[i].field(),
new BytesRef(terms[i].text()));
}
while (tps[0].nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
for (int i = 1; i < numTerms; i++) {
tps[i].nextDoc();
}
int freq = tps[0].freq();
while (tps[0].nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
for (int i = 1; i < numTerms; i++) {
tps[i].nextDoc();
}
int freq = tps[0].freq();
for (int i = 0; i < freq; i++) {
for (int j = 0; j < numTerms; j++) {
tps[j].nextPosition();
BytesRef br = tps[j].getPayload();
if (br != null) {
System.arraycopy(br.bytes, br.offset, verifyPayloadData, offset, br.length);
offset += br.length;
}
}
}
for (int i = 0; i < freq; i++) {
for (int j = 0; j < numTerms; j++) {
tps[j].nextPosition();
BytesRef br = tps[j].getPayload();
if (br != null) {
System.arraycopy(br.bytes, br.offset, verifyPayloadData, offset, br.length);
offset += br.length;
}
}
}
}
assertByteArrayEquals(payloadData, verifyPayloadData);
assertByteArrayEquals(payloadData, verifyPayloadData);
/*
* test lazy skipping
*/
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[0].field(),
new BytesRef(terms[0].text()));
tp.nextDoc();
tp.nextPosition();
// NOTE: prior rev of this test was failing to first
// call next here:
tp.nextDoc();
// now we don't read this payload
tp.nextPosition();
BytesRef payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[numTerms]);
tp.nextDoc();
tp.nextPosition();
/*
* test lazy skipping
*/
DocsAndPositionsEnum tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[0].field(),
new BytesRef(terms[0].text()));
tp.nextDoc();
tp.nextPosition();
// NOTE: prior rev of this test was failing to first
// call next here:
tp.nextDoc();
// now we don't read this payload
tp.nextPosition();
BytesRef payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[numTerms]);
tp.nextDoc();
tp.nextPosition();
// we don't read this payload and skip to a different document
tp.advance(5);
tp.nextPosition();
payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[5 * numTerms]);
// we don't read this payload and skip to a different document
tp.advance(5);
tp.nextPosition();
payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[5 * numTerms]);
/*
* Test different lengths at skip points
*/
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[1].field(),
new BytesRef(terms[1].text()));
tp.nextDoc();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(2 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(3 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayload().length);
/*
* Test different lengths at skip points
*/
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
terms[1].field(),
new BytesRef(terms[1].text()));
tp.nextDoc();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(2 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(3 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayload().length);
reader.close();
reader.close();
// test long payload
analyzer = new PayloadAnalyzer();
writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
String singleTerm = "lucene";
// test long payload
analyzer = new PayloadAnalyzer();
writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
String singleTerm = "lucene";
d = new Document();
d.add(newTextField(fieldName, singleTerm, Field.Store.NO));
// add a payload whose length is greater than the buffer size of BufferedIndexOutput
payloadData = generateRandomData(2000);
analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
writer.addDocument(d);
d = new Document();
d.add(newTextField(fieldName, singleTerm, Field.Store.NO));
// add a payload whose length is greater than the buffer size of BufferedIndexOutput
payloadData = generateRandomData(2000);
analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
writer.addDocument(d);
writer.forceMerge(1);
// flush
writer.close();
writer.forceMerge(1);
// flush
writer.close();
reader = DirectoryReader.open(dir);
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
fieldName,
new BytesRef(singleTerm));
tp.nextDoc();
tp.nextPosition();
reader = DirectoryReader.open(dir);
tp = MultiFields.getTermPositionsEnum(reader,
MultiFields.getLiveDocs(reader),
fieldName,
new BytesRef(singleTerm));
tp.nextDoc();
tp.nextPosition();
BytesRef br = tp.getPayload();
verifyPayloadData = new byte[br.length];
byte[] portion = new byte[1500];
System.arraycopy(payloadData, 100, portion, 0, 1500);
BytesRef br = tp.getPayload();
verifyPayloadData = new byte[br.length];
byte[] portion = new byte[1500];
System.arraycopy(payloadData, 100, portion, 0, 1500);
assertByteArrayEquals(portion, br.bytes, br.offset, br.length);
reader.close();
assertByteArrayEquals(portion, br.bytes, br.offset, br.length);
reader.close();
}
}
static final Charset utf8 = StandardCharsets.UTF_8;
static final Charset utf8 = StandardCharsets.UTF_8;
private void generateRandomData(byte[] data) {
// this test needs the random data to be valid unicode
String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length);
byte b[] = s.getBytes(utf8);
assert b.length == data.length;
System.arraycopy(b, 0, data, 0, b.length);
}
private void generateRandomData(byte[] data) {
// this test needs the random data to be valid unicode
String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length);
byte b[] = s.getBytes(utf8);
assert b.length == data.length;
System.arraycopy(b, 0, data, 0, b.length);
}
private byte[] generateRandomData(int n) {
byte[] data = new byte[n];
generateRandomData(data);
return data;
}
private byte[] generateRandomData(int n) {
byte[] data = new byte[n];
generateRandomData(data);
return data;
}
private Term[] generateTerms(String fieldName, int n) {
int maxDigits = (int) (Math.log(n) / Math.log(10));
Term[] terms = new Term[n];
StringBuilder sb = new StringBuilder();
for (int i = 0; i < n; i++) {
sb.setLength(0);
sb.append("t");
int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
for (int j = 0; j < zeros; j++) {
sb.append("0");
}
sb.append(i);
terms[i] = new Term(fieldName, sb.toString());
}
return terms;
private Term[] generateTerms(String fieldName, int n) {
int maxDigits = (int) (Math.log(n) / Math.log(10));
Term[] terms = new Term[n];
StringBuilder sb = new StringBuilder();
for (int i = 0; i < n; i++) {
sb.setLength(0);
sb.append("t");
int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
for (int j = 0; j < zeros; j++) {
sb.append("0");
}
sb.append(i);
terms[i] = new Term(fieldName, sb.toString());
}
return terms;
}
void assertByteArrayEquals(byte[] b1, byte[] b2) {
if (b1.length != b2.length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
}
void assertByteArrayEquals(byte[] b1, byte[] b2) {
if (b1.length != b2.length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
}
}
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
}
}
}
void assertByteArrayEquals(byte[] b1, byte[] b2, int b2offset, int b2length) {
if (b1.length != b2length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2length);
}
if (b1.length != b2length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[b2offset+i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[b2offset+i]);
}
}
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[b2offset+i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[b2offset+i]);
}
}
}
/**
* This Analyzer uses a WhitespaceTokenizer and PayloadFilter.
*/
private static class PayloadAnalyzer extends Analyzer {
Map<String,PayloadData> fieldToData = new HashMap<>();
static class PayloadData {
byte[] data;
int offset;
int length;
public PayloadAnalyzer() {
super(PER_FIELD_REUSE_STRATEGY);
}
public PayloadAnalyzer(String field, byte[] data, int offset, int length) {
super(PER_FIELD_REUSE_STRATEGY);
setPayloadData(field, data, offset, length);
}
PayloadData(byte[] data, int offset, int length) {
this.data = data;
this.offset = offset;
this.length = length;
}
}
void setPayloadData(String field, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(data, offset, length));
}
@Override
public TokenStreamComponents createComponents(String fieldName) {
PayloadData payload = fieldToData.get(fieldName);
Tokenizer ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenStream tokenStream = (payload != null) ?
new PayloadFilter(ts, payload.data, payload.offset, payload.length) : ts;
return new TokenStreamComponents(ts, tokenStream);
}
private static class PayloadData {
byte[] data;
int offset;
int length;
/**
* This Analyzer uses a MockTokenizer and PayloadFilter.
*/
private static class PayloadAnalyzer extends Analyzer {
Map<String,PayloadData> fieldToData = new HashMap<>();
PayloadData(byte[] data, int offset, int length) {
this.data = data;
this.offset = offset;
this.length = length;
}
}
public PayloadAnalyzer() {
super(PER_FIELD_REUSE_STRATEGY);
}
public PayloadAnalyzer(String field, byte[] data, int offset, int length) {
super(PER_FIELD_REUSE_STRATEGY);
setPayloadData(field, data, offset, length);
}
void setPayloadData(String field, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(data, offset, length));
}
@Override
public TokenStreamComponents createComponents(String fieldName) {
PayloadData payload = fieldToData.get(fieldName);
Tokenizer ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenStream tokenStream = (payload != null) ?
new PayloadFilter(ts, fieldName, fieldToData) : ts;
return new TokenStreamComponents(ts, tokenStream);
}
}
/**
* This Filter adds payloads to the tokens.
*/
private static class PayloadFilter extends TokenFilter {
private byte[] data;
private int length;
private int offset;
private int startOffset;
PayloadAttribute payloadAtt;
CharTermAttribute termAttribute;
/**
* This Filter adds payloads to the tokens.
*/
private static class PayloadFilter extends TokenFilter {
PayloadAttribute payloadAtt;
CharTermAttribute termAttribute;
private Map<String,PayloadData> fieldToData;
private String fieldName;
private PayloadData payloadData;
private int offset;
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
super(in);
this.data = data;
this.length = length;
this.offset = offset;
this.startOffset = offset;
payloadAtt = addAttribute(PayloadAttribute.class);
termAttribute = addAttribute(CharTermAttribute.class);
}
public PayloadFilter(TokenStream in, String fieldName, Map<String,PayloadData> fieldToData) {
super(in);
this.fieldToData = fieldToData;
this.fieldName = fieldName;
payloadAtt = addAttribute(PayloadAttribute.class);
termAttribute = addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (!hasNext) {
return false;
}
@Override
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (!hasNext) {
return false;
}
// Some values of the same field are to have payloads and others not
if (offset + length <= data.length && !termAttribute.toString().endsWith("NO PAYLOAD")) {
BytesRef p = new BytesRef(data, offset, length);
payloadAtt.setPayload(p);
offset += length;
} else {
payloadAtt.setPayload(null);
}
// Some values of the same field are to have payloads and others not
if (offset + payloadData.length <= payloadData.data.length && !termAttribute.toString().endsWith("NO PAYLOAD")) {
BytesRef p = new BytesRef(payloadData.data, offset, payloadData.length);
payloadAtt.setPayload(p);
offset += payloadData.length;
} else {
payloadAtt.setPayload(null);
}
return true;
return true;
}
@Override
public void reset() throws IOException {
super.reset();
this.payloadData = fieldToData.get(fieldName);
this.offset = payloadData.offset;
}
}
public void testThreadSafety() throws Exception {
final int numThreads = 5;
final int numDocs = atLeast(50);
final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
final String field = "test";
Thread[] ingesters = new Thread[numThreads];
for (int i = 0; i < numThreads; i++) {
ingesters[i] = new Thread() {
@Override
public void run() {
try {
for (int j = 0; j < numDocs; j++) {
Document d = new Document();
d.add(new TextField(field, new PoolingPayloadTokenStream(pool)));
writer.addDocument(d);
}
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
}
};
ingesters[i].start();
}
for (int i = 0; i < numThreads; i++) {
ingesters[i].join();
}
writer.close();
IndexReader reader = DirectoryReader.open(dir);
TermsEnum terms = MultiFields.getFields(reader).terms(field).iterator(null);
Bits liveDocs = MultiFields.getLiveDocs(reader);
DocsAndPositionsEnum tp = null;
while (terms.next() != null) {
String termText = terms.term().utf8ToString();
tp = terms.docsAndPositions(liveDocs, tp);
while(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = tp.freq();
for (int i = 0; i < freq; i++) {
tp.nextPosition();
final BytesRef payload = tp.getPayload();
assertEquals(termText, payload.utf8ToString());
}
}
}
reader.close();
dir.close();
assertEquals(pool.size(), numThreads);
}
private class PoolingPayloadTokenStream extends TokenStream {
private byte[] payload;
private boolean first;
private ByteArrayPool pool;
private String term;
@Override
public void reset() throws IOException {
super.reset();
this.offset = startOffset;
CharTermAttribute termAtt;
PayloadAttribute payloadAtt;
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
generateRandomData(payload);
term = new String(payload, 0, payload.length, utf8);
first = true;
payloadAtt = addAttribute(PayloadAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (!first) return false;
first = false;
clearAttributes();
termAtt.append(term);
payloadAtt.setPayload(new BytesRef(payload));
return true;
}
@Override
public void close() throws IOException {
pool.release(payload);
}
}
private static class ByteArrayPool {
private List<byte[]> pool;
ByteArrayPool(int capacity, int size) {
pool = new ArrayList<>();
for (int i = 0; i < capacity; i++) {
pool.add(new byte[size]);
}
}
public void testThreadSafety() throws Exception {
final int numThreads = 5;
final int numDocs = atLeast(50);
final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
final String field = "test";
Thread[] ingesters = new Thread[numThreads];
for (int i = 0; i < numThreads; i++) {
ingesters[i] = new Thread() {
@Override
public void run() {
try {
for (int j = 0; j < numDocs; j++) {
Document d = new Document();
d.add(new TextField(field, new PoolingPayloadTokenStream(pool)));
writer.addDocument(d);
}
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
}
};
ingesters[i].start();
}
for (int i = 0; i < numThreads; i++) {
ingesters[i].join();
}
writer.close();
IndexReader reader = DirectoryReader.open(dir);
TermsEnum terms = MultiFields.getFields(reader).terms(field).iterator(null);
Bits liveDocs = MultiFields.getLiveDocs(reader);
DocsAndPositionsEnum tp = null;
while (terms.next() != null) {
String termText = terms.term().utf8ToString();
tp = terms.docsAndPositions(liveDocs, tp);
while(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = tp.freq();
for (int i = 0; i < freq; i++) {
tp.nextPosition();
final BytesRef payload = tp.getPayload();
assertEquals(termText, payload.utf8ToString());
}
}
}
reader.close();
dir.close();
assertEquals(pool.size(), numThreads);
synchronized byte[] get() {
return pool.remove(0);
}
private class PoolingPayloadTokenStream extends TokenStream {
private byte[] payload;
private boolean first;
private ByteArrayPool pool;
private String term;
CharTermAttribute termAtt;
PayloadAttribute payloadAtt;
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
generateRandomData(payload);
term = new String(payload, 0, payload.length, utf8);
first = true;
payloadAtt = addAttribute(PayloadAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (!first) return false;
first = false;
clearAttributes();
termAtt.append(term);
payloadAtt.setPayload(new BytesRef(payload));
return true;
}
@Override
public void close() throws IOException {
pool.release(payload);
}
synchronized void release(byte[] b) {
pool.add(b);
}
private static class ByteArrayPool {
private List<byte[]> pool;
ByteArrayPool(int capacity, int size) {
pool = new ArrayList<>();
for (int i = 0; i < capacity; i++) {
pool.add(new byte[size]);
}
}
synchronized byte[] get() {
return pool.remove(0);
}
synchronized void release(byte[] b) {
pool.add(b);
}
synchronized int size() {
return pool.size();
}
synchronized int size() {
return pool.size();
}
}
public void testAcrossFields() throws Exception {
Directory dir = newDirectory();
@ -646,5 +645,4 @@ public class TestPayloads extends LuceneTestCase {
reader.close();
dir.close();
}
}

View File

@ -390,9 +390,9 @@ public class TestControlledRealTimeReopenThread extends ThreadedIndexingAndSearc
@Override
public void updateDocument(Term term,
IndexDocument doc, Analyzer analyzer)
IndexDocument doc)
throws IOException {
super.updateDocument(term, doc, analyzer);
super.updateDocument(term, doc);
try {
if (waitAfterUpdate) {
signal.countDown();

View File

@ -2040,22 +2040,22 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
Document doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 1, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 1));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 3, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 3));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 5, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 5));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 7, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 7));
writer.addDocument(doc, analyzer);
writer.addDocument(doc);
Document childDoc = doc(FIELD_NAME, "child document");
Document parentDoc = doc(FIELD_NAME, "parent document");

View File

@ -47,7 +47,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
* will be sorted while segments resulting from a flush will be in the order
* in which documents have been added.
* <p><b>NOTE</b>: Never use this policy if you rely on
* {@link IndexWriter#addDocuments(Iterable, Analyzer) IndexWriter.addDocuments}
* {@link IndexWriter#addDocuments(Iterable) IndexWriter.addDocuments}
* to have sequentially-assigned doc IDs, this policy will scatter doc IDs.
* <p><b>NOTE</b>: This policy should only be used with idempotent {@code Sort}s
* so that the order of segments is predictable. For example, using

View File

@ -103,11 +103,6 @@ public class RandomIndexWriter implements Closeable {
* @see IndexWriter#addDocument(org.apache.lucene.index.IndexDocument)
*/
public <T extends IndexableField> void addDocument(final IndexDocument doc) throws IOException {
LuceneTestCase.maybeChangeLiveIndexWriterConfig(r, w.getConfig());
addDocument(doc, w.getAnalyzer());
}
public <T extends IndexableField> void addDocument(final IndexDocument doc, Analyzer a) throws IOException {
LuceneTestCase.maybeChangeLiveIndexWriterConfig(r, w.getConfig());
if (r.nextInt(5) == 3) {
// TODO: maybe, we should simply buffer up added docs
@ -141,9 +136,9 @@ public class RandomIndexWriter implements Closeable {
}
};
}
}, a);
});
} else {
w.addDocument(doc, a);
w.addDocument(doc);
}
maybeCommit();

View File

@ -536,7 +536,7 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
log.warn(logid+"Solr index directory '" + new File(indexDir) + "' doesn't exist."
+ " Creating new index...");
SolrIndexWriter writer = SolrIndexWriter.create("SolrCore.initIndex", indexDir, getDirectoryFactory(), true,
SolrIndexWriter writer = SolrIndexWriter.create(this, "SolrCore.initIndex", indexDir, getDirectoryFactory(), true,
getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec);
writer.close();
}

View File

@ -264,7 +264,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
}
protected SolrIndexWriter createMainIndexWriter(SolrCore core, String name) throws IOException {
return SolrIndexWriter.create(name, core.getNewIndexDir(),
return SolrIndexWriter.create(core, name, core.getNewIndexDir(),
core.getDirectoryFactory(), false, core.getLatestSchema(),
core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
}

View File

@ -235,11 +235,11 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
}
if (cmd.isBlock()) {
writer.updateDocuments(updateTerm, cmd, schema.getIndexAnalyzer());
writer.updateDocuments(updateTerm, cmd);
} else {
Document luceneDocument = cmd.getLuceneDocument();
// SolrCore.verbose("updateDocument",updateTerm,luceneDocument,writer);
writer.updateDocument(updateTerm, luceneDocument, schema.getIndexAnalyzer());
writer.updateDocument(updateTerm, luceneDocument);
}
// SolrCore.verbose("updateDocument",updateTerm,"DONE");
@ -264,9 +264,9 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
} else {
// allow duplicates
if (cmd.isBlock()) {
writer.addDocuments(cmd, schema.getIndexAnalyzer());
writer.addDocuments(cmd);
} else {
writer.addDocument(cmd.getLuceneDocument(), schema.getIndexAnalyzer());
writer.addDocument(cmd.getLuceneDocument());
}
if (ulog != null) ulog.add(cmd);
@ -442,8 +442,7 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
RefCounted<IndexWriter> iw = solrCoreState.getIndexWriter(core);
try {
IndexWriter writer = iw.get();
writer.updateDocument(idTerm, luceneDocument, cmd.getReq().getSchema()
.getIndexAnalyzer());
writer.updateDocument(idTerm, luceneDocument);
for (Query q : dbqList) {
writer.deleteDocuments(new DeleteByQueryWrapper(q, core.getLatestSchema()));

View File

@ -17,6 +17,14 @@
package org.apache.solr.update;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.util.InfoStream;
@ -24,16 +32,14 @@ import org.apache.lucene.util.Version;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.MapSerializable;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.SolrPluginUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
import static org.apache.solr.core.Config.assertWarnOrFail;
/**
@ -180,12 +186,23 @@ public class SolrIndexConfig implements MapSerializable {
return l.isEmpty() ? def : l.get(0);
}
public IndexWriterConfig toIndexWriterConfig(IndexSchema schema) {
// so that we can update the analyzer on core reload, we pass null
// for the default analyzer, and explicitly pass an analyzer on
// appropriate calls to IndexWriter
IndexWriterConfig iwc = new IndexWriterConfig(null);
private static class DelayedSchemaAnalyzer extends DelegatingAnalyzerWrapper {
private final SolrCore core;
public DelayedSchemaAnalyzer(SolrCore core) {
super(PER_FIELD_REUSE_STRATEGY);
this.core = core;
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return core.getLatestSchema().getIndexAnalyzer();
}
}
public IndexWriterConfig toIndexWriterConfig(SolrCore core) {
IndexSchema schema = core.getLatestSchema();
IndexWriterConfig iwc = new IndexWriterConfig(new DelayedSchemaAnalyzer(core));
if (maxBufferedDocs != -1)
iwc.setMaxBufferedDocs(maxBufferedDocs);

View File

@ -121,7 +121,7 @@ public class SolrIndexSplitter {
} else {
SolrCore core = searcher.getCore();
String path = paths.get(partitionNumber);
iw = SolrIndexWriter.create("SplittingIndexWriter"+partitionNumber + (ranges != null ? " " + ranges.get(partitionNumber) : ""), path,
iw = SolrIndexWriter.create(core, "SplittingIndexWriter"+partitionNumber + (ranges != null ? " " + ranges.get(partitionNumber) : ""), path,
core.getDirectoryFactory(), true, core.getLatestSchema(),
core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
}

View File

@ -27,8 +27,9 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.InfoStream;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.DirectoryFactory.DirContext;
import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -56,12 +57,12 @@ public class SolrIndexWriter extends IndexWriter {
private InfoStream infoStream;
private Directory directory;
public static SolrIndexWriter create(String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
public static SolrIndexWriter create(SolrCore core, String name, String path, DirectoryFactory directoryFactory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
SolrIndexWriter w = null;
final Directory d = directoryFactory.get(path, DirContext.DEFAULT, config.lockType);
try {
w = new SolrIndexWriter(name, path, d, create, schema,
w = new SolrIndexWriter(core, name, path, d, create, schema,
config, delPolicy, codec);
w.setDirectoryFactory(directoryFactory);
return w;
@ -73,9 +74,9 @@ public class SolrIndexWriter extends IndexWriter {
}
}
private SolrIndexWriter(String name, String path, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
private SolrIndexWriter(SolrCore core, String name, String path, Directory directory, boolean create, IndexSchema schema, SolrIndexConfig config, IndexDeletionPolicy delPolicy, Codec codec) throws IOException {
super(directory,
config.toIndexWriterConfig(schema).
config.toIndexWriterConfig(core).
setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND).
setIndexDeletionPolicy(delPolicy).setCodec(codec)
);
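
Because the analyzer now has to come from the core's latest schema, the SolrCore is threaded through the factory method and constructor so that toIndexWriterConfig(core) can build the schema-tracking analyzer. A sketch of an updated call site (the helper method and variable names are assumptions; the argument list mirrors the SolrIndexSplitter call above):

import java.io.IOException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.update.SolrIndexWriter;

public class SolrIndexWriterCallerSketch {
  // Illustrative helper: every argument is derived from the core, and the core itself
  // is passed so the IndexWriterConfig can resolve the analyzer from the latest schema.
  static SolrIndexWriter open(SolrCore core, String name, String path) throws IOException {
    return SolrIndexWriter.create(core, name, path,
        core.getDirectoryFactory(), false, core.getLatestSchema(),
        core.getSolrConfig().indexConfig, core.getDeletionPolicy(), core.getCodec());
  }
}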

View File

@ -116,7 +116,7 @@ public class TestConfig extends SolrTestCaseJ4 {
assertEquals("default useCompoundFile", false, sic.getUseCompoundFile());
IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
IndexWriterConfig iwc = sic.toIndexWriterConfig(indexSchema);
IndexWriterConfig iwc = sic.toIndexWriterConfig(h.getCore());
assertNotNull("null mp", iwc.getMergePolicy());
assertTrue("mp is not TMP", iwc.getMergePolicy() instanceof TieredMergePolicy);

View File

@ -31,7 +31,7 @@ public class TestInfoStreamLogging extends SolrTestCaseJ4 {
}
public void testIndexConfig() throws Exception {
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
assertTrue(iwc.getInfoStream() instanceof LoggingInfoStream);
}

View File

@ -45,7 +45,7 @@ public class TestMergePolicyConfig extends SolrTestCaseJ4 {
public void testDefaultMergePolicyConfig() throws Exception {
initCore("solrconfig-mergepolicy-defaults.xml","schema-minimal.xml");
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
assertEquals(false, iwc.getUseCompoundFile());
TieredMergePolicy tieredMP = assertAndCast(TieredMergePolicy.class,
@ -61,7 +61,7 @@ public class TestMergePolicyConfig extends SolrTestCaseJ4 {
= Boolean.parseBoolean(System.getProperty("useCompoundFile"));
initCore("solrconfig-mergepolicy-legacy.xml","schema-minimal.xml");
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
assertEquals(expectCFS, iwc.getUseCompoundFile());
@ -81,7 +81,7 @@ public class TestMergePolicyConfig extends SolrTestCaseJ4 {
= Boolean.parseBoolean(System.getProperty("useCompoundFile"));
initCore("solrconfig-tieredmergepolicy.xml","schema-minimal.xml");
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
assertEquals(expectCFS, iwc.getUseCompoundFile());
@ -122,7 +122,7 @@ public class TestMergePolicyConfig extends SolrTestCaseJ4 {
System.setProperty("solr.test.log.merge.policy", mpClass.getName());
initCore("solrconfig-logmergepolicy.xml","schema-minimal.xml");
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
// verify some props set to -1 get lucene internal defaults
assertEquals(-1, solrConfig.indexConfig.maxBufferedDocs);

View File

@ -47,7 +47,7 @@ public class TestSolrIndexConfig extends SolrTestCaseJ4 {
public void testIndexConfigParsing() throws Exception {
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore().getLatestSchema());
IndexWriterConfig iwc = solrConfig.indexConfig.toIndexWriterConfig(h.getCore());
try {
checkIndexWriterConfig(iwc);
} finally {

View File

@ -17,6 +17,10 @@ package org.apache.solr.update;
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SimpleMergedSegmentWarmer;
@ -26,13 +30,10 @@ import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.TestMergePolicyConfig;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.IndexSchemaFactory;
import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.IOException;
/**
* Testcase for {@link SolrIndexConfig}
*
@ -40,13 +41,19 @@ import java.io.IOException;
*/
public class SolrIndexConfigTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml","schema.xml");
}
@Test
public void testFailingSolrIndexConfigCreation() {
try {
SolrConfig solrConfig = new SolrConfig("bad-mp-solrconfig.xml");
SolrIndexConfig solrIndexConfig = new SolrIndexConfig(solrConfig, null, null);
IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
solrIndexConfig.toIndexWriterConfig(indexSchema);
h.getCore().setLatestSchema(indexSchema);
solrIndexConfig.toIndexWriterConfig(h.getCore());
fail("a mergePolicy should have an empty constructor in order to be instantiated in Solr thus this should fail ");
} catch (Exception e) {
// it failed as expected
@ -61,8 +68,9 @@ public class SolrIndexConfigTest extends SolrTestCaseJ4 {
null);
assertNotNull(solrIndexConfig);
IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(indexSchema);
h.getCore().setLatestSchema(indexSchema);
IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(h.getCore());
assertNotNull("null mp", iwc.getMergePolicy());
assertTrue("mp is not TMP", iwc.getMergePolicy() instanceof TieredMergePolicy);
@ -87,7 +95,8 @@ public class SolrIndexConfigTest extends SolrTestCaseJ4 {
assertEquals(SimpleMergedSegmentWarmer.class.getName(),
solrIndexConfig.mergedSegmentWarmerInfo.className);
IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(indexSchema);
h.getCore().setLatestSchema(indexSchema);
IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(h.getCore());
assertEquals(SimpleMergedSegmentWarmer.class, iwc.getMergedSegmentWarmer().getClass());
}
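
The test updates all follow the same shape: because toIndexWriterConfig no longer takes the IndexSchema directly, a schema built for the test must first be installed on the core, and only then is the IndexWriterConfig derived from that core. A condensed sketch of the pattern used above (the trailing assertion is an illustrative addition, not part of the commit):

IndexSchema indexSchema = IndexSchemaFactory.buildIndexSchema("schema.xml", solrConfig);
// The config reads the schema from the core (via the wrapping analyzer), so the
// schema under test has to be registered on the core before the IWC is built.
h.getCore().setLatestSchema(indexSchema);
IndexWriterConfig iwc = solrIndexConfig.toIndexWriterConfig(h.getCore());
// Illustrative check: the config now carries a real (non-null) analyzer.
assertNotNull(iwc.getAnalyzer());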