From aac4e16774d5f110c8143c6d41c189a35418c086 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Mon, 26 Jul 2010 19:31:34 +0000
Subject: [PATCH] LUCENE-2554: add comment explaining why we can't assert valid UTF8 when dancing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/preflexfixes@979415 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/index/codecs/preflex/PreFlexFields.java | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
index 473bff94a20..9351a3324dc 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
@@ -290,9 +290,10 @@ public class PreFlexFields extends FieldsProducer {
           // unicode character:
           assert isHighBMPChar(term.bytes, pos);
 
-          // TODO: understand why this assert sometimes (rarely)
-          // trips!
-          // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3);
+          // NOTE: we cannot make this assert, because
+          // AutomatonQuery legitimately sends us malformed UTF8
+          // (eg the UTF8 bytes with just 0xee)
+          // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();
 
           // Save the bytes && length, since we need to
           // restore this if seek "back" finds no matching