LUCENE-3969: demote the n-grams again (with explanation)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311915 13f79535-47bb-0310-9956-ffa450edef68
2012-04-10 18:36:34 +00:00 · 2012-04-10 18:36:34 +00:00 · c58dfd5516
parent ad994d8281
commit c58dfd5516
1 changed files with 16 additions and 9 deletions
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@ -113,7 +113,22 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
                                 LimitTokenCountFilter.class,
                                 // Not broken: we forcefully add this, so we shouldn't
                                 // also randomly pick it:
-                                 ValidatingTokenFilter.class
+                                 ValidatingTokenFilter.class,
+                                 // NOTE: these by themselves won't cause any 'basic assertions' to fail,
+                                 // but see https://issues.apache.org/jira/browse/LUCENE-3920: if any
+                                 // tokenfilter that combines words (e.g. shingles) comes after them,
+                                 // this will create bogus offsets because their 'offsets go backwards',
+                                 // causing shingle or whatever to make a single token with a
+                                 // startOffset that's > its endOffset
+                                 // (see LUCENE-3738 for a list of other offenders here)
+                                 // broken!
+                                 NGramTokenizer.class,
+                                 // broken!
+                                 NGramTokenFilter.class,
+                                 // broken!
+                                 EdgeNGramTokenizer.class,
+                                 // broken!
+                                 EdgeNGramTokenFilter.class
    );
  }

@ -130,14 +145,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
                                 DictionaryCompoundWordTokenFilter.class,
                                 // nocommit: corrumpts graphs (offset consistency check):
                                 PositionFilter.class,
-                                 // broken!
-                                 NGramTokenizer.class,
-                                 // broken!
-                                 NGramTokenFilter.class,
-                                 // broken!
-                                 EdgeNGramTokenizer.class,
-                                 // broken!
-                                 EdgeNGramTokenFilter.class,
                                 // nocommit it seems to mess up offsets!?
                                 WikipediaTokenizer.class
                                 );