LUCENE-2016: remap invalid U+FFFF char during indexing, to prevent silent corruption

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@831041 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-10-29 17:30:46 +00:00
parent 88d3d41992
commit de0aaadb81
3 changed files with 24 additions and 3 deletions

View File

@ -164,6 +164,10 @@ Bug fixes
* LUCENE-2013: SpanRegexQuery does not work with QueryScorer.
(Benjamin Keil via Mark Miller)
* LUCENE-2016: Replace illegal U+FFFF character with the replacement
char (U+FFFD) during indexing, to prevent silent index corruption.
(Peter Keegan, Mike McCandless)
New features
* LUCENE-1933: Provide a convenience AttributeFactory that creates a

View File

@ -377,9 +377,11 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
}
}
} else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
// Unpaired
} else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END ||
ch == 0xffff)) {
// Unpaired or 0xffff
ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
}
code = (code*31) + ch;
}

View File

@ -29,7 +29,6 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.HashSet;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
@ -4523,4 +4522,20 @@ public class TestIndexWriter extends LuceneTestCase {
w.close();
d.close();
}
/**
 * Regression test for LUCENE-2016: indexes field text that has a U+FFFF
 * char embedded in it, adds a second plain document, and then runs
 * checkIndex over the resulting directory.
 */
public void testEmbeddedFFFF() throws Throwable {
  final Directory dir = new MockRAMDirectory();
  final IndexWriter writer =
      new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);

  // First document: field text carries the embedded U+FFFF char.
  final Document withEmbeddedChar = new Document();
  withEmbeddedChar.add(new Field("field", "a a\uffffb", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(withEmbeddedChar);

  // Second document: plain text only, in the same field.
  final Document plain = new Document();
  plain.add(new Field("field", "a", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(plain);

  // Close the writer before checking the index, then release the directory.
  writer.close();
  _TestUtil.checkIndex(dir);
  dir.close();
}
}