LUCENE-3690: Re-implemented HTMLStripCharFilter as a JFlex-generated scanner. Fixes LUCENE-2208, SOLR-882, and SOLR-42.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234452 13f79535-47bb-0310-9956-ffa450edef68
2012-01-22 05:20:46 +00:00 · 2012-01-22 05:20:46 +00:00 · f3a363708f
parent 17fe719bb5
commit f3a363708f
22 changed files with 36700 additions and 1358 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -793,6 +793,9 @@ New Features
 * LUCENE-3121: Add TypeTokenFilter that filters tokens based on
  their TypeAttribute.  (Tommaso Teofili via Uwe Schindler)

+* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
+  markup. (Steve Rowe)
+  
 Bug fixes

 * LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
--- a/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
@ -249,7 +249,42 @@ public class _TestUtil {
    }
  }
  
-  // TODO: make this more evil
+  private static final String[] HTML_CHAR_ENTITIES = {
+      "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
+      "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
+      "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
+      "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
+      "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
+      "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
+      "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
+      "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
+      "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
+      "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
+      "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
+      "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
+      "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
+      "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
+      "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
+      "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
+      "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
+      "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
+      "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
+      "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
+      "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
+      "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
+      "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
+      "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
+      "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
+      "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
+      "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
+      "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
+      "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
+      "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
+      "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
+      "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
+      "yuml", "zeta", "zwj", "zwnj"
+  };
+  
  public static String randomHtmlishString(Random random, int numElements) {
    final int end = random.nextInt(numElements);
    if (end == 0) {
@ -258,17 +293,80 @@ public class _TestUtil {
    }
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < end; i++) {
-      int val = random.nextInt(10);
+      int val = random.nextInt(25);
      switch(val) {
        case 0: sb.append("<p>"); break;
-        case 1: sb.append("</p>"); break;
-        case 2: sb.append("<!--"); break;
-        case 3: sb.append("-->"); break;
-        case 4: sb.append("&#"); break;
-        case 5: sb.append(";"); break;
-        case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
-        default:
-          sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+        case 1: {
+          sb.append("<");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
+            sb.append(' ');
+            sb.append(randomSimpleString(random));
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append('=');
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+            sb.append(randomSimpleString(random));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+          }
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append("/".substring(nextInt(random, 0, 1)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 2: {
+          sb.append("</");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 3: sb.append(">"); break;
+        case 4: sb.append("</p>"); break;
+        case 5: sb.append("<!--"); break;
+        case 6: sb.append("<!--#"); break;
+        case 7: sb.append("<script><!-- f('"); break;
+        case 8: sb.append("</script>"); break;
+        case 9: sb.append("<?"); break;
+        case 10: sb.append("?>"); break;
+        case 11: sb.append("\""); break;
+        case 12: sb.append("\\\""); break;
+        case 13: sb.append("'"); break;
+        case 14: sb.append("\\'"); break;
+        case 15: sb.append("-->"); break;
+        case 16: {
+          sb.append("&");
+          switch(nextInt(random, 0, 2)) {
+            case 0: sb.append(randomSimpleString(random)); break;
+            case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
+          }
+          sb.append(";".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 17: {
+          sb.append("&#");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        } 
+        case 18: {
+          sb.append("&#x");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        }
+          
+        case 19: sb.append(";"); break;
+        case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
+        // Terminate each case explicitly, like every other case in this switch, so a
+        // newline or a run of spaces is emitted on its own rather than falling through
+        // and also appending the output of the cases below it.
+        case 21: sb.append("\n"); break;
+        case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
+        default: sb.append(randomSimpleString(random));
      }
    }
    return sb.toString();
--- a/modules/analysis/common/build.xml
+++ b/modules/analysis/common/build.xml
@ -31,7 +31,8 @@
  <target name="compile-core" depends="jflex-notice, common.compile-core"/>

  <target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
-                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
+                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
+                                jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>

  <target name="gen-uax29-supp-macros">
    <subant target="gen-uax29-supp-macros">
@ -39,6 +40,29 @@
    </subant>
  </target>

+  <!-- Regenerates HTMLStripCharFilter.java from HTMLStripCharFilter.jflex.
+       The target body only runs when the jflex.present property is set; the
+       targets listed in "depends" are still executed first. -->
+  <target name="jflex-HTMLStripCharFilter"
+          depends="init,jflex-check,generate-jflex-html-char-entities"
+          if="jflex.present">
+    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+      <classpath refid="jflex.classpath"/>
+    </taskdef>
+    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+           outdir="src/java/org/apache/lucene/analysis/charfilter"
+           nobak="on"/>
+    <!-- Remove the inappropriate JFlex-generated constructors -->
+    <!-- The pattern is greedy and uses flags "s" (dot also matches line
+         terminators) and "g" (replace every match): it deletes everything from
+         the first "Creates a new scanner" Javadoc comment through the closing
+         brace of the constructor that wraps an InputStream in an
+         InputStreamReader. -->
+    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+                   match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                   replace="" flags="sg"/>
+  </target>
+
+  <!-- Regenerates HTMLCharacterEntities.jflex by running htmlentity.py from the
+       charfilter source directory with the interpreter named by ${python.exe}.
+       The script's standard output is written to the .jflex file; because
+       logerror is enabled, its error output goes to the Ant log instead of the
+       file. A non-zero exit status fails the build (failonerror). -->
+  <target name="generate-jflex-html-char-entities">
+    <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+          output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+          executable="${python.exe}" failonerror="true" logerror="true">
+      <arg value="htmlentity.py"/>
+    </exec>
+  </target>
+
  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
      <classpath refid="jflex.classpath"/>
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.util.ArrayUtil;

+import java.util.Arrays;
+
 /**
 * Base utility class for implementing a {@link CharFilter}.
 * You subclass this, and then record mappings by calling
@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
      0 : diffs[size-1];
  }

+  /**
+   * <p>
+   *   Adds an offset correction mapping at the given output stream offset.
+   * </p>
+   * <p>
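+   *   Assumption: the offset given with each successive call to this method
+   *   will not be smaller than the offset given at the previous invocation.
+   *   If the given offset is equal to the most recently recorded offset, the
+   *   correction previously recorded at that offset is overwritten instead of
+   *   a second mapping being added.
+   * </p>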
+   *
+   * @param off The output stream offset at which to apply the correction
+   * @param cumulativeDiff The input offset is given by adding this
+   *                       to the output offset
+   */
  protected void addOffCorrectMap(int off, int cumulativeDiff) {
    if (offsets == null) {
      offsets = new int[64];
@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
      diffs = ArrayUtil.grow(diffs);
    }
    
+    // Offsets must be non-decreasing. Compare against the last *recorded* offset,
+    // which lives at index size - 1; slot [size] has not been written yet, so
+    // checking it would never detect an out-of-order offset.
+    assert (size == 0 || off >= offsets[size - 1])
+        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+    
+    if (size == 0 || off != offsets[size - 1]) {
      offsets[size] = off;
      diffs[size++] = cumulativeDiff;
+    } else { // Overwrite the diff at the last recorded offset
+      diffs[size - 1] = cumulativeDiff;
+    }
  }
 }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
+                    | "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
+                    | "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
+                    | "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
+                    | "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
+                    | "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
+                    | "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
+                    | "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
+                    | "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
+                    | "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
+                    | "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
+                    | "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
+                    | "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
+                    | "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
+                    | "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
+                    | "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
+                    | "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
+                    | "divide" | "eacute" | "ecirc" | "egrave" | "empty"
+                    | "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
+                    | "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
+                    | "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
+                    | "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
+                    | "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
+                    | "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
+                    | "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
+                    | "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
+                    | "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
+                    | "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
+                    | "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
+                    | "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
+                    | "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
+                    | "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
+                    | "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
+                    | "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
+                    | "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
+                    | "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
+                    | "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
+                    | "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
+                    | "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
+                    | "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
+                    | "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
+                    | "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
+                    | "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
+                    | "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
+                    | "zwj" | "zwnj" )
+%{
+  private static final Set<String> upperCaseVariantsAccepted
+      = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
+  private static final CharArrayMap<Character> entityValues
+      = new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
+  static {
+    String[] entities = {
+      "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
+      "Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
+      "Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
+      "Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
+      "Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
+      "Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
+      "Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
+      "Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
+      "Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
+      "Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
+      "Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
+      "Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
+      "Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
+      "Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
+      "Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
+      "Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
+      "Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
+      "Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
+      "aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
+      "aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
+      "alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
+      "apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
+      "atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
+      "beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
+      "ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
+      "circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
+      "crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
+      "dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
+      "diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
+      "ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
+      "emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
+      "equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
+      "euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
+      "forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
+      "frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
+      "gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
+      "hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
+      "iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
+      "infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
+      "isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
+      "lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
+      "larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
+      "lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
+      "lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
+      "mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
+      "minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
+      "ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
+      "notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
+      "oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
+      "ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
+      "omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
+      "ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
+      "otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
+      "permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
+      "piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
+      "prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
+      "quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
+      "raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
+      "rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
+      "rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
+      "sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
+      "sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
+      "sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
+      "sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
+      "sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
+      "there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
+      "thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
+      "times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
+      "uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
+      "ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
+      "upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
+      "xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
+      "zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
+    };
+    for (int i = 0 ; i < entities.length ; i += 2) {
+      Character value = entities[i + 1].charAt(0);
+      entityValues.put(entities[i], value);
+      if (upperCaseVariantsAccepted.contains(entities[i])) {
+        entityValues.put(entities[i].toUpperCase(), value);
+      }
+    }
+  }
+%}
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
@ -0,0 +1,58 @@
+/*
+ * Copyright 2010 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
+// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
+
+
+ID_Start_Supp = (
+	  [\uD81A][\uDC00-\uDE38]
+	| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
+	| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
+	| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
+	| [\uD82C][\uDC00\uDC01]
+	| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
+	| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
+	| [\uD87E][\uDC00-\uDE1D]
+	| [\uD809][\uDC00-\uDC62]
+	| [\uD808][\uDC00-\uDF6E]
+	| [\uD803][\uDC00-\uDC48]
+	| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
+	| [\uD80D][\uDC00-\uDC2E]
+	| [\uD86E][\uDC00-\uDC1D]
+	| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
+	| [\uD801][\uDC00-\uDC9D]
+)
+ID_Continue_Supp = (
+	  [\uD81A][\uDC00-\uDE38]
+	| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
+	| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
+	| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
+	| [\uD82C][\uDC00\uDC01]
+	| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
+	| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
+	| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
+	| [\uD87E][\uDC00-\uDE1D]
+	| [\uD809][\uDC00-\uDC62]
+	| [\uD808][\uDC00-\uDF6E]
+	| [\uD803][\uDC00-\uDC48]
+	| [\uD80D][\uDC00-\uDC2E]
+	| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
+	| [\uD86E][\uDC00-\uDC1D]
+	| [\uDB40][\uDD00-\uDDEF]
+	| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
+	| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
+)
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
@ -0,0 +1,737 @@
+package org.apache.lucene.analysis.charfilter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.OpenStringBuilder;
+
+
+/**
+ * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
+ */
+%%
+
+// Unicode version used to resolve \p{...} properties and character classes.
+%unicode 6.0
+// Generated scanner API (yylength(), yybegin(), ...) is private to the class.
+%apiprivate
+// The scanning method returns an int: a char value, or the %eofval value.
+%type int
+%final
+%public
+// Maintain yychar, the offset of the current match from the start of input;
+// the actions use it to compute offset corrections.
+%char
+// Name of the generated scanning method, called from read().
+%function nextChar
+%class HTMLStripCharFilter
+%extends BaseCharFilter
+// All states below are exclusive: rules without a state list apply only in
+// YYINITIAL, never inside these states.
+%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
+%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
+%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
+%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
+%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
+%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
+%xstate STYLE, STYLE_COMMENT
+
+// From XML 1.0 <http://www.w3.org/TR/xml/>:
+//
+//    [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
+//    [4a] NameChar      ::= NameStartChar | "-" | "." | [0-9] | [...]
+//    [5]  Name          ::= NameStartChar (NameChar)*
+//
+// From UAX #31: Unicode Identifier and Pattern Syntax
+// <http://unicode.org/reports/tr31/>:
+//
+//    D1. Default Identifier Syntax
+//
+//        <identifier> := <ID_Start> <ID_Continue>*
+//
+// A tag name: one start character followed by any number of continuation
+// characters.  The \p{...} classes cover single-char (BMP) characters; the
+// *_Supp macros add surrogate-pair alternatives for characters outside the
+// BMP (presumably defined in the %include'd SUPPLEMENTARY macro file).
+Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
+
+// From Apache httpd mod_include documentation
+// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
+//
+// Basic Elements
+//
+//    The document is parsed as an HTML document, with special commands
+//    embedded as SGML comments. A command has the syntax:
+//
+//       <!--#element attribute=value attribute=value ... -->
+//
+//    The value will often be enclosed in double quotes, but single quotes (')
+//    and backticks (`) are also possible. Many commands only allow a single
+//    attribute-value pair. Note that the comment terminator (-->) should be
+//    preceded by whitespace to ensure that it isn't considered part of an SSI
+//    token. Note that the leading <!--# is one token and may not contain any
+//    whitespaces.
+//
+
+// Case-insensitive event names (abort, blur, change, ...).  EventAttribute
+// prefixes one of these with "on" to recognize attributes such as onclick or
+// onload.
+EventAttributeSuffixes = ( [aA][bB][oO][rR][tT]                 |
+                           [bB][lL][uU][rR]                     |
+                           [cC][hH][aA][nN][gG][eE]             |
+                           [cC][lL][iI][cC][kK]                 |
+	                         [dD][bB][lL][cC][lL][iI][cC][kK]     |
+                           [eE][rR][rR][oO][rR]                 |
+                           [fF][oO][cC][uU][sS]                 |
+	                         [kK][eE][yY][dD][oO][wW][nN]         |
+	                         [kK][eE][yY][pP][rR][eE][sS][sS]     |
+	                         [kK][eE][yY][uU][pP]                 |
+                           [lL][oO][aA][dD]                     |
+	                         [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
+	                         [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
+                           [mM][oO][uU][sS][eE][oO][uU][tT]     |
+                           [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
+	                         [mM][oO][uU][sS][eE][uU][pP]         |
+                           [rR][eE][sS][eE][tT]                 |
+                           [sS][eE][lL][eE][cC][tT]             |
+                           [sS][uU][bB][mM][iI][tT]             |
+                           [uU][nN][lL][oO][aA][dD]             )
+
+// A quoted string; the first alternative admits a backslash-escaped quote
+// character inside the string.
+SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
+DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
+// A server-side include directive; quoted strings inside it are matched as
+// units.
+ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
+// An "on<event>" attribute with a quoted value, e.g. onclick="...".
+EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
+// Everything that may follow the tag name inside a start tag: any sequence of
+// event attributes, server-side includes, and characters other than '<' and
+// '>'.
+OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
+
+// Case-insensitive names of inline elements.  The tag rules send tags with
+// these names to the *_TAG_TAIL_EXCLUDE states (removed, nothing emitted),
+// while any other tag name goes to *_TAG_TAIL_SUBSTITUTE (replaced by a
+// single character).
+// NOTE: the macro name is spelled "InlineElment" (sic); the rules refer to
+// it by that spelling.
+InlineElment = ( [aAbBiIqQsSuU]                   |
+                 [aA][bB][bB][rR]                 |
+                 [aA][cC][rR][oO][nN][yY][mM]     |
+                 [bB][aA][sS][eE][fF][oO][nN][tT] |
+                 [bB][dD][oO]                     |
+                 [bB][iI][gG]                     |
+                 [cC][iI][tT][eE]                 |
+                 [cC][oO][dD][eE]                 |
+                 [dD][fF][nN]                     |
+                 [eE][mM]                         |
+                 [fF][oO][nN][tT]                 |
+                 [iI][mM][gG]                     |
+                 [iI][nN][pP][uU][tT]             |
+                 [kK][bB][dD]                     |
+                 [lL][aA][bB][eE][lL]             |
+                 [sS][aA][mM][pP]                 |
+                 [sS][eE][lL][eE][cC][tT]         |
+                 [sS][mM][aA][lL][lL]             |
+                 [sS][pP][aA][nN]                 |
+                 [sS][tT][rR][iI][kK][eE]         |
+                 [sS][tT][rR][oO][nN][gG]         |
+                 [sS][uU][bB]                     |
+                 [sS][uU][pP]                     |
+                 [tT][eE][xX][tT][aA][rR][eE][aA] |
+                 [tT][tT]                         |
+                 [vV][aA][rR]                     )
+
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+
+%{
+  // Initial capacity, in chars, requested for inputSegment.
+  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
+  // Single characters returned in place of removed markup.
+  private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
+  private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
+  private static final char BR_START_TAG_REPLACEMENT = '\n';
+  private static final char BR_END_TAG_REPLACEMENT = '\n';
+  private static final char SCRIPT_REPLACEMENT = '\n';
+  private static final char STYLE_REPLACEMENT = '\n';
+
+  // Tag names to pass through unfiltered, other than BR, SCRIPT and STYLE
+  // (those have their own flags below); stays null when there are none.
+  private CharArraySet escapedTags = null;
+  // Value of yychar at the '&' or '<' that opened the construct being scanned.
+  private int inputStart;
+  // Running (input length - output length) difference, recorded through
+  // addOffCorrectMap() whenever markup is removed or replaced.
+  private int cumulativeDiff;
+  private boolean escapeBR = false;
+  private boolean escapeSCRIPT = false;
+  private boolean escapeSTYLE = false;
+  // Lexical state to return to when a server-side include or quoted string ends.
+  private int restoreState;
+  // restoreState saved while inside a quoted string within a server-side include.
+  private int previousRestoreState;
+  // Incremented by read() for each value it returns from a segment or the scanner.
+  private int outputCharCount;
+  // Value the scanner returns at end of input; set by the %eof code.
+  private int eofReturnValue;
+  // Raw text of the markup construct currently being scanned.
+  private TextSegment inputSegment
+      = new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
+  // Pending output drained by read(); refers to inputSegment or entitySegment.
+  private TextSegment outputSegment = inputSegment;
+  // Decoded value of a character reference (one char, or a surrogate pair).
+  private TextSegment entitySegment = new TextSegment(2);
+
+  /**
+   * Creates a filter that strips HTML markup from the given stream, with no
+   * escaped (passed-through) tags.
+   *
+   * @param source the stream to read from
+   */
+  public HTMLStripCharFilter(CharStream source) {
+    this(source, null);
+  }
+
+  /**
+   * Creates a filter that strips HTML markup from the given stream, except
+   * for the named tags.
+   *
+   * @param source the stream to read from
+   * @param escapedTags Tags in this set (both start and end tags)
+   *  will not be filtered out; may be null.  Names are matched without
+   *  regard to case.
+   */
+  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+    super(source);
+    this.zzReader = source;
+    if (null == escapedTags) {
+      return;
+    }
+    for (String tagName : escapedTags) {
+      // BR, SCRIPT and STYLE have dedicated scanner rules, so they are tracked
+      // with flags instead of being stored in the set.
+      if (tagName.equalsIgnoreCase("BR")) {
+        escapeBR = true;
+        continue;
+      }
+      if (tagName.equalsIgnoreCase("SCRIPT")) {
+        escapeSCRIPT = true;
+        continue;
+      }
+      if (tagName.equalsIgnoreCase("STYLE")) {
+        escapeSTYLE = true;
+        continue;
+      }
+      // Allocate the set lazily, on the first ordinary tag name.
+      if (null == this.escapedTags) {
+        this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
+      }
+      this.escapedTags.add(tagName);
+    }
+  }
+
+  /**
+   * Returns the next filtered character: first whatever is still pending in
+   * the output segment, then the next value produced by the scanner.
+   *
+   * @return the next value, or -1 once the pending output is drained and the
+   *         scanner has reached end of input
+   */
+  @Override
+  public int read() throws IOException {
+    final int ch;
+    if (! outputSegment.isRead()) {
+      ch = outputSegment.nextChar();
+    } else if (zzAtEOF) {
+      return -1;
+    } else {
+      ch = nextChar();
+    }
+    ++outputCharCount;
+    return ch;
+  }
+
+  /**
+   * Fills the given buffer region one character at a time from {@link #read()}.
+   *
+   * @param cbuf destination buffer
+   * @param off index in cbuf of the first char to store
+   * @param len maximum number of chars to store
+   * @return the number of chars stored; 0 if len is 0; otherwise -1 when
+   *         nothing could be read
+   */
+  @Override
+  public int read(char cbuf[], int off, int len) throws IOException {
+    int count = 0;
+    while (count < len) {
+      int ch = read();
+      if (ch == -1) {
+        break;
+      }
+      cbuf[off + count] = (char)ch;
+      ++count;
+    }
+    if (count > 0) {
+      return count;
+    }
+    return len == 0 ? 0 : -1;
+  }
+
+  /** Closes this filter by delegating to the generated scanner's yyclose(). */
+  @Override
+  public void close() throws IOException {
+    yyclose();
+  }
+
+  /** Returns the generated scanner's ZZ_BUFFERSIZE constant. */
+  static int getInitialBufferSize() {  // Package private, for testing purposes
+    return ZZ_BUFFERSIZE;
+  }
+
+  /**
+   * A growable char buffer with a read cursor: text is appended to it while
+   * scanning, then handed out one char at a time through {@link #nextChar()}.
+   * <p>
+   * Declared static because it uses no state of the enclosing filter, so
+   * instances need not carry a reference to it.
+   */
+  private static class TextSegment extends OpenStringBuilder {
+    /** The position from which the next char will be read. */
+    int pos = 0;
+
+    /** Wraps the given buffer and sets this.len to the given length. */
+    TextSegment(char[] buffer, int length) {
+      super(buffer, length);
+    }
+
+    /** Allocates an internal buffer of the given size. */
+    TextSegment(int size) {
+      super(size);
+    }
+
+    /** Sets len = 0 and pos = 0. */
+    void clear() {
+      reset();
+      restart();
+    }
+
+    /** Sets pos = 0 */
+    void restart() {
+      pos = 0;
+    }
+
+    /**
+     * Returns the next char in the segment and advances the read position.
+     * Callers must check {@link #isRead()} first.
+     */
+    int nextChar() {
+      assert (! isRead()): "Attempting to read past the end of a segment.";
+      return buf[pos++];
+    }
+
+    /** Returns true when all characters in the text segment have been read */
+    boolean isRead() {
+      return pos >= len;
+    }
+  }
+%}
+
+// End-of-input handling.  The %eof code runs when the scanner reaches end of
+// input and decides, from the lexical state it was in, what to do with markup
+// that was never completed; %eofval then returns the value it chose.
+%eofval{
+  return eofReturnValue;
+%eofval}
+%eof{
+  switch (zzLexicalState) {
+    case SCRIPT:
+    case COMMENT:
+    case SCRIPT_COMMENT:
+    case STYLE:
+    case STYLE_COMMENT:
+    case SINGLE_QUOTED_STRING:
+    case DOUBLE_QUOTED_STRING:
+    case END_TAG_TAIL_EXCLUDE:
+    case END_TAG_TAIL_SUBSTITUTE:
+    case START_TAG_TAIL_EXCLUDE:
+    case SERVER_SIDE_INCLUDE:
+    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      // Drop everything from the start of the unfinished construct to the end
+      // of input: add its length to the offset difference, output nothing.
+      cumulativeDiff += yychar - inputStart;
+      addOffCorrectMap(outputCharCount, cumulativeDiff);
+      outputSegment.clear();
+      eofReturnValue = -1;
+      break;
+    }
+    case CHARACTER_REFERENCE_TAIL: {        // Substitute
+      // At end of file, allow char refs without semicolons
+      cumulativeDiff += inputSegment.length() - outputSegment.length();
+      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      eofReturnValue = outputSegment.nextChar();
+      break;
+    }
+    case BANG:
+    case CDATA:
+    case AMPERSAND:
+    case NUMERIC_CHARACTER:
+    case END_TAG_TAIL_INCLUDE:
+    case START_TAG_TAIL_INCLUDE:
+    case LEFT_ANGLE_BRACKET:
+    case LEFT_ANGLE_BRACKET_SLASH:
+    case LEFT_ANGLE_BRACKET_SPACE: {        // Include
+      // Emit the unfinished construct as literal text.  The segment can be
+      // empty here: entering CDATA clears it and CDATA content is returned
+      // directly, so an unterminated CDATA section leaves nothing pending.
+      // Reading from an empty segment would trip the assertion in nextChar()
+      // (or return a stale buffer char), so report end of input instead.
+      outputSegment = inputSegment;
+      eofReturnValue = outputSegment.isRead() ? -1 : outputSegment.nextChar();
+      break;
+    }
+    default: {
+      eofReturnValue = -1;
+    }
+  }
+%eof}
+
+%%
+
+// Start of a possible character reference: remember where it began and start
+// collecting its raw text, in case it has to be emitted unchanged.
+"&" {
+  inputStart = yychar;
+  inputSegment.clear();
+  inputSegment.append('&');
+  yybegin(AMPERSAND);
+}
+
+// Start of a possible tag, comment, CDATA section or processing instruction.
+"<" {
+  inputStart = yychar;
+  inputSegment.clear();
+  inputSegment.append('<');
+  yybegin(LEFT_ANGLE_BRACKET);
+}
+
+<AMPERSAND> {
+  // Named character entity: look up its replacement char and stage it in
+  // entitySegment; it is only emitted once CHARACTER_REFERENCE_TAIL accepts.
+  {CharacterEntities} {
+    int length = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, length);
+    entitySegment.clear();
+    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
+    entitySegment.append(ch);
+    outputSegment = entitySegment;
+    yybegin(CHARACTER_REFERENCE_TAIL);
+  }
+  // "&#" introduces a numeric character reference.
+  "#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
+}
+
+<NUMERIC_CHARACTER> {
+  // Hexadecimal character reference body: 'x' or 'X' followed by hex digits.
+  // A valid code point is staged in entitySegment and emitted once
+  // CHARACTER_REFERENCE_TAIL accepts; anything else is emitted as raw text.
+  [xX] [0-9A-Fa-f]+ {
+    int matchLength = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, matchLength);
+    // The match includes the leading 'x', so the largest code point, 10FFFF,
+    // takes 1 + 6 chars.  Six hex digits always fit in an int, so parsing
+    // cannot overflow; out-of-range values are rejected below.
+    if (matchLength <= 7) {
+      String hexCharRef
+          = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
+      try {
+        int codePoint = Integer.parseInt(hexCharRef, 16);
+        if (codePoint <= 0x10FFFF) {
+          outputSegment = entitySegment;
+          outputSegment.clear();
+          // toChars() writes one or two chars and returns how many it wrote.
+          outputSegment.setLength
+              (Character.toChars(codePoint, outputSegment.getArray(), 0));
+          yybegin(CHARACTER_REFERENCE_TAIL);
+        } else {
+          outputSegment = inputSegment;
+          yybegin(YYINITIAL);
+          return outputSegment.nextChar();
+        }
+      } catch(NumberFormatException e) {
+        assert false: "NumberFormatException parsing hex code point '"
+                      + hexCharRef + "'";
+      } catch(IllegalArgumentException e) {
+        assert false: "IllegalArgumentException getting chars "
+                      + "for hex code point '" + hexCharRef + "'";
+      }
+    } else {
+      // Too many digits to be a code point: emit the raw text unchanged.
+      outputSegment = inputSegment;
+      yybegin(YYINITIAL);
+      return outputSegment.nextChar();
+    }
+  }
+  // Decimal character reference body.
+  [0-9]+ {
+    int matchLength = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, matchLength);
+    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
+      String decimalCharRef = yytext();
+      try {
+        int codePoint = Integer.parseInt(decimalCharRef);
+        if (codePoint <= 0x10FFFF) {
+          outputSegment = entitySegment;
+          outputSegment.clear();
+          outputSegment.setLength
+              (Character.toChars(codePoint, outputSegment.getArray(), 0));
+          yybegin(CHARACTER_REFERENCE_TAIL);
+        } else {
+          outputSegment = inputSegment;
+          yybegin(YYINITIAL);
+          return outputSegment.nextChar();
+        }
+      } catch(NumberFormatException e) {
+        assert false: "NumberFormatException parsing code point '"
+                      + decimalCharRef + "'";
+      } catch(IllegalArgumentException e) {
+        assert false: "IllegalArgumentException getting chars for code point '"
+                      + decimalCharRef + "'";
+      }
+    } else {
+      // Too many digits to be a code point: emit the raw text unchanged.
+      outputSegment = inputSegment;
+      yybegin(YYINITIAL);
+      return outputSegment.nextChar();
+    }
+  }
+}
+
+<CHARACTER_REFERENCE_TAIL> {
+  // The terminating semicolon: the reference is complete, so emit the decoded
+  // value staged in outputSegment.
+  ";" {
+    // add (raw reference length) + (this match length) - (decoded length)
+    cumulativeDiff
+        += inputSegment.length() + yylength() - outputSegment.length();
+    // the correction applies after the decoded chars have been output
+    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+  }
+}
+
+<LEFT_ANGLE_BRACKET_SLASH> {
+  // Whitespace between "</" and the tag name is kept as part of the raw text.
+  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
+  // A complete </br> end tag: passed through if BR is escaped, otherwise
+  // replaced by a single character.
+  [bB][rR] \s* ">" {
+    yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      // add (previously matched input length) + (this match length)
+      //     - (substitute length).  The substitute is the one replacement
+      // char returned below, not whatever outputSegment last held.
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction just after the replacement char
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_END_TAG_REPLACEMENT;
+    }
+  }
+  // End tag of an inline element: dropped without replacement unless escaped.
+  {InlineElment} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(END_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(END_TAG_TAIL_EXCLUDE);
+    }
+  }
+  // End tag of any other element: replaced by one character unless escaped.
+  {Name} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(END_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(END_TAG_TAIL_SUBSTITUTE);
+    }
+  }
+}
+
+<END_TAG_TAIL_INCLUDE> {
+   // Close of an escaped end tag: emit the whole tag as literal text.
+   \s* ">" {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+   }
+}
+
+<END_TAG_TAIL_EXCLUDE> {
+  // Close of an end tag that is removed without any replacement.
+  \s* ">" {
+    // add (previously matched input length) + (this match length); nothing
+    // is output in its place
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+}
+
+<END_TAG_TAIL_SUBSTITUTE> {
+  // Close of an end tag that is replaced by a single character.
+  \s* ">" {
+    // add (previously matched input length) + (this match length)
+    //     - (substitute length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction just after the substitute char
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+  }
+}
+
+<LEFT_ANGLE_BRACKET> {
+  "!" { inputSegment.append('!'); yybegin(BANG); }
+  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
+  \s+ {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    yybegin(LEFT_ANGLE_BRACKET_SPACE);
+  }
+  // "<? ... ?>" or "<? ... />": removed without replacement.
+  "?" [^>]* [/?] ">" {
+    // add (previously matched input length) + (this match length)
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  // A complete <br> start tag: passed through if BR is escaped, otherwise
+  // replaced by a single character.
+  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      // add (previously matched input length) + (this match length)
+      //     - (substitute length).  The substitute is the one replacement
+      // char returned below, not whatever outputSegment last held (which may
+      // be a two-char entity value).
+      cumulativeDiff += inputSegment.length() + yylength() - 1;
+      // position the correction just after the replacement char
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_START_TAG_REPLACEMENT;
+    }
+  }
+  // <script> start tag: switch to SCRIPT, where content is skipped.  When
+  // SCRIPT is escaped the tag itself is emitted, and inputStart moves past
+  // the tag so only the skipped content is counted at the end tag.
+  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s*  ">" {
+    yybegin(SCRIPT);
+    if (escapeSCRIPT) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      inputStart += 1 + yylength();
+      return outputSegment.nextChar();
+    }
+  }
+  // <style> start tag: handled the same way as <script>.
+  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
+    yybegin(STYLE);
+    if (escapeSTYLE) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      inputStart += 1 + yylength();
+      return outputSegment.nextChar();
+    }
+  }
+}
+
+<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
+  // Start tag of an inline element: dropped without replacement unless the
+  // tag name is in escapedTags.
+  {InlineElment} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(START_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(START_TAG_TAIL_EXCLUDE);
+    }
+  }
+  // Start tag of any other element: replaced by one character unless the tag
+  // name is in escapedTags.
+  {Name} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(START_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(START_TAG_TAIL_SUBSTITUTE);
+    }
+  }
+}
+
+<START_TAG_TAIL_INCLUDE> {
+   // Remainder of an escaped start tag: emit the whole tag as literal text.
+   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+   }
+}
+
+<START_TAG_TAIL_EXCLUDE> {
+   // Remainder of a start tag that is removed without any replacement.
+   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    // add (previously matched input length) + (this match length); nothing
+    // is output in its place
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    outputSegment = inputSegment;
+    yybegin(YYINITIAL);
+  }
+}
+
+<START_TAG_TAIL_SUBSTITUTE> {
+  // Remainder of a start tag that is replaced by a single character.
+  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s*  "/"? ">" {
+    // add (previously matched input length) + (this match length)
+    //     - (substitute length)
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    // position the correction just after the substitute char
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+  }
+}
+
+<BANG> {
+  // "<!" followed (possibly after other chars) by "--" opens a comment.
+  "--" { yybegin(COMMENT); }
+  // "<! ... >": a declaration, removed without replacement.
+  ">" {
+    // add (previously matched input length) + (this match length)
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  // From XML 1.0 <http://www.w3.org/TR/xml/>:
+  //
+  // [18] CDSect  ::= CDStart CData CDEnd
+  // [19] CDStart ::= '<![CDATA['
+  // [20] CData   ::= (Char* - (Char* ']]>' Char*))
+  // [21] CDEnd   ::= ']]>'
+  //
+  // The CDATA start marker itself is removed; the section's content is then
+  // passed through by the CDATA state.
+  "[CDATA[" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(CDATA);
+  }
+  // Any other char is collected as part of the raw text of the construct.
+  [^] {
+    inputSegment.append(zzBuffer[zzStartRead]);
+  }
+}
+
+<CDATA> {
+  // End marker of the CDATA section: removed, nothing output in its place.
+  "]]>" {
+    cumulativeDiff += yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    yybegin(YYINITIAL);
+  }
+  // CDATA content is passed through unchanged, one char at a time.
+  [^] { return zzBuffer[zzStartRead]; }
+}
+
+<COMMENT> {
+  // A server-side include nested inside the comment; return here when it ends.
+  "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  // End of the comment: everything from the opening '<' (inputStart) through
+  // this terminator is removed, with nothing output in its place.
+  "-->" {
+    cumulativeDiff += yychar - inputStart + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  // Comment content is skipped.
+  [^] { }
+}
+
+<SERVER_SIDE_INCLUDE> {
+  // End of the include: go back to the state that was active before it.
+  "-->" { yybegin(restoreState); }
+  // A quoted string inside the include.  restoreState is saved first, because
+  // the string states use it to find their way back here and then restore
+  // the saved value.
+  "'" {
+    previousRestoreState = restoreState;
+    restoreState = SERVER_SIDE_INCLUDE;
+    yybegin(SINGLE_QUOTED_STRING);
+  }
+  "\"" {
+    previousRestoreState = restoreState;
+    restoreState = SERVER_SIDE_INCLUDE;
+    yybegin(DOUBLE_QUOTED_STRING);
+  }
+  // Everything else in the include is skipped.
+  [^] { }
+}
+
+// An HTML comment inside a script element.  Nested server-side includes and
+// quoted strings are scanned in their own states, which return here through
+// restoreState; "-->" returns to the enclosing SCRIPT state.
+<SCRIPT_COMMENT> {
+  "<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  "'"     { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+  "\""    { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+  "-->"   { yybegin(SCRIPT); }
+  [^] { }
+}
+
+// An HTML comment inside a style element.  Nested server-side includes and
+// quoted strings are scanned in their own states, which return here through
+// restoreState; "-->" returns to the enclosing STYLE state.
+<STYLE_COMMENT> {
+  "<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  "'"     { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+  "\""    { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+  "-->"   { yybegin(STYLE); }
+  [^] { }
+}
+
+<SINGLE_QUOTED_STRING> {
+  // A backslash consumes the following char too, so an escaped quote does
+  // not end the string.
+  "\\" [^] { }
+  // Closing quote: return to the state that opened the string, then restore
+  // the value that restoreState had before it.
+  "'" { yybegin(restoreState); restoreState = previousRestoreState; }
+  // String content is skipped.
+  [^] { }
+}
+
+<DOUBLE_QUOTED_STRING> {
+  // A backslash consumes the following char too, so an escaped quote does
+  // not end the string.
+  "\\" [^] { }
+  // Closing quote: return to the state that opened the string, then restore
+  // the value that restoreState had before it.
+  "\"" { yybegin(restoreState); restoreState = previousRestoreState; }
+  // String content is skipped.
+  [^] { }
+}
+
+<SCRIPT> {
+  "<!--" { yybegin(SCRIPT_COMMENT); }
+  // The script end tag.
+  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    // Everything consumed since inputStart was skipped: the script content,
+    // plus the start tag unless it was emitted (inputStart is then moved past
+    // it by the start-tag rule).
+    cumulativeDiff += yychar - inputStart;
+    int outputEnd = outputCharCount;
+    int returnValue;
+    if (escapeSCRIPT) {
+      // Escaped: emit the end tag itself as literal text.
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      // Not escaped: the end tag is dropped too, and the whole element is
+      // replaced by one char, so the correction starts just after that char.
+      cumulativeDiff += yylength() - 1;
+      ++outputEnd;
+      returnValue = SCRIPT_REPLACEMENT;
+    }
+    addOffCorrectMap(outputEnd, cumulativeDiff);
+    return returnValue;
+  }
+  // Script content is skipped.
+  [^] { }
+}
+
+<STYLE> {
+  "<!--" { yybegin(STYLE_COMMENT); }
+  // The style end tag.
+  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    // Everything consumed since inputStart was skipped: the style content,
+    // plus the start tag unless it was emitted (inputStart is then moved past
+    // it by the start-tag rule).
+    cumulativeDiff += yychar - inputStart;
+    int outputEnd = outputCharCount;
+    int returnValue;
+    if (escapeSTYLE) {
+      // Escaped: emit the end tag itself as literal text.
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      // Not escaped: the end tag is dropped too, and the whole element is
+      // replaced by one char, so the correction starts just after that char.
+      cumulativeDiff += yylength() - 1;
+      ++outputEnd;
+      returnValue = STYLE_REPLACEMENT;
+    }
+    addOffCorrectMap(outputEnd, cumulativeDiff);
+    return returnValue;
+  }
+  // Style content is skipped.
+  [^] { }
+}
+
+// Fallback for the states that are part-way through a character reference or
+// tag: a char that no other rule accepts means the text was not markup after
+// all.  Give that char back to the scanner, emit the raw text collected so
+// far from its beginning, and resume normal scanning.
+<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
+  [^] {
+    yypushback(1);
+    outputSegment = inputSegment;
+    outputSegment.restart();
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+  }
+}
+
+// Any other char outside markup is passed through unchanged.
+[^] { return zzBuffer[zzStartRead]; }
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
@ -0,0 +1,530 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+# A simple python script to generate an HTML entity map and a regex alternation
+# for inclusion in HTMLStripCharFilter.jflex.
+
+def main():
+  print get_apache_license()
+  codes = {}
+  regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
+  for line in get_entity_text().split('\n'):
+    match = regex.match(line)
+    if match:
+      key = match.group(1)
+      if   key == 'quot': codes[key] = r'\"'
+      elif key == 'nbsp': codes[key] = ' ';
+      else              : codes[key] = r'\u%04X' % int(match.group(2))
+
+  keys = sorted(codes)
+
+  first_entry = True
+  output_line = 'CharacterEntities = ( '
+  for key in keys:
+    new_entry = ('"%s"' if first_entry else ' | "%s"') % key
+    first_entry = False
+    if len(output_line) + len(new_entry) >= 80:
+      print output_line
+      output_line = '                   '
+    output_line += new_entry
+    if key in ('quot','copy','gt','lt','reg','amp'):
+      new_entry = ' | "%s"' % key.upper()
+      if len(output_line) + len(new_entry) >= 80:
+        print output_line
+        output_line = '                   '
+      output_line += new_entry
+  print output_line, ')'
+
+  print '%{'
+  print '  private static final Set<String> upperCaseVariantsAccepted'
+  print '      = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
+  print '  private static final CharArrayMap<Character> entityValues'
+  print '      = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
+  print '  static {'
+  print '    String[] entities = {'
+  output_line = '     '
+  for key in keys:
+    new_entry = ' "%s", "%s",' % (key, codes[key])
+    if len(output_line) + len(new_entry) >= 80:
+      print output_line
+      output_line = '     '
+    output_line += new_entry
+  print output_line[:-1]
+  print '    };'
+  print '    for (int i = 0 ; i < entities.length ; i += 2) {'
+  print '      Character value = entities[i + 1].charAt(0);'
+  print '      entityValues.put(entities[i], value);'
+  print '      if (upperCaseVariantsAccepted.contains(entities[i])) {'
+  print '        entityValues.put(entities[i].toUpperCase(), value);'
+  print '      }'
+  print '    }'
+  print "  }"
+  print "%}"
+
+def get_entity_text():
+# The text below is taken verbatim from
+# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
+  text = r"""
+F.1. XHTML Character Entities
+
+XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
+F.1.1. XHTML Latin 1 Character Entities
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-lat1.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-lat1
+           PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+                  "xhtml-lat1.ent" >
+       %xhtml-lat1;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
+
+     Revision:  $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+-->
+
+<!ENTITY nbsp   "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
+<!ENTITY iexcl  "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
+<!ENTITY cent   "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
+<!ENTITY pound  "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
+<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
+<!ENTITY yen    "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
+<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
+<!ENTITY sect   "&#167;" ><!-- section sign, U+00A7 ISOnum -->
+<!ENTITY uml    "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
+<!ENTITY copy   "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
+<!ENTITY ordf   "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
+<!ENTITY laquo  "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
+<!ENTITY not    "&#172;" ><!-- not sign, U+00AC ISOnum -->
+<!ENTITY shy    "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
+<!ENTITY reg    "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
+<!ENTITY macr   "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
+<!ENTITY deg    "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
+<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
+<!ENTITY sup2   "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
+<!ENTITY sup3   "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
+<!ENTITY acute  "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
+<!ENTITY micro  "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
+<!ENTITY para   "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
+<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
+<!ENTITY cedil  "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
+<!ENTITY sup1   "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
+<!ENTITY ordm   "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
+<!ENTITY raquo  "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
+<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
+<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
+<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
+<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
+<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
+<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
+<!ENTITY Acirc  "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
+<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
+<!ENTITY Auml   "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
+<!ENTITY Aring  "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
+<!ENTITY AElig  "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
+<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
+<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
+<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
+<!ENTITY Ecirc  "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
+<!ENTITY Euml   "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
+<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
+<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
+<!ENTITY Icirc  "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
+<!ENTITY Iuml   "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
+<!ENTITY ETH    "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
+<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
+<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
+<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
+<!ENTITY Ocirc  "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
+<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
+<!ENTITY Ouml   "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
+<!ENTITY times  "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
+<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
+<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
+<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
+<!ENTITY Ucirc  "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
+<!ENTITY Uuml   "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
+<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
+<!ENTITY THORN  "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
+<!ENTITY szlig  "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
+<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
+<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
+<!ENTITY acirc  "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
+<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
+<!ENTITY auml   "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
+<!ENTITY aring  "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
+<!ENTITY aelig  "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
+<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
+<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
+<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
+<!ENTITY ecirc  "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
+<!ENTITY euml   "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
+<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
+<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
+<!ENTITY icirc  "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
+<!ENTITY iuml   "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
+<!ENTITY eth    "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
+<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
+<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
+<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
+<!ENTITY ocirc  "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
+<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
+<!ENTITY ouml   "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
+<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
+<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
+<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
+<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
+<!ENTITY ucirc  "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
+<!ENTITY uuml   "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
+<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
+<!ENTITY thorn  "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
+<!ENTITY yuml   "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
+<!-- end of xhtml-lat1.ent -->
+
+F.1.2. XHTML Special Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-special.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-special
+           PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+                  "xhtml-special.ent" >
+       %xhtml-special;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
+
+     Revision:  $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+
+     Revisions:
+2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+     New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+     any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+     numbers are given for each character, in hex. Entity values are
+     decimal conversions of the ISO 10646 values and refer to the
+     document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- C0 Controls and Basic Latin -->
+<!ENTITY lt      "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
+<!ENTITY gt      "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
+<!ENTITY amp     "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
+<!ENTITY apos    "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
+<!ENTITY quot    "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
+
+<!-- Latin Extended-A -->
+<!ENTITY OElig   "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
+<!ENTITY oelig   "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
+
+<!-- ligature is a misnomer, this is a separate character in some languages -->
+<!ENTITY Scaron  "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
+<!ENTITY scaron  "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
+<!ENTITY Yuml    "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
+
+<!-- Spacing Modifier Letters -->
+<!ENTITY circ    "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
+<!ENTITY tilde   "&#732;" ><!-- small tilde, U+02DC ISOdia -->
+
+<!-- General Punctuation -->
+<!ENTITY ensp    "&#8194;" ><!-- en space, U+2002 ISOpub -->
+<!ENTITY emsp    "&#8195;" ><!-- em space, U+2003 ISOpub -->
+<!ENTITY thinsp  "&#8201;" ><!-- thin space, U+2009 ISOpub -->
+<!ENTITY zwnj    "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
+<!ENTITY zwj     "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
+<!ENTITY lrm     "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
+<!ENTITY rlm     "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
+<!ENTITY ndash   "&#8211;" ><!-- en dash, U+2013 ISOpub -->
+<!ENTITY mdash   "&#8212;" ><!-- em dash, U+2014 ISOpub -->
+<!ENTITY lsquo   "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
+<!ENTITY rsquo   "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
+<!ENTITY sbquo   "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
+<!ENTITY ldquo   "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
+<!ENTITY rdquo   "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
+<!ENTITY bdquo   "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
+<!ENTITY dagger  "&#8224;" ><!-- dagger, U+2020 ISOpub -->
+<!ENTITY Dagger  "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
+<!ENTITY permil  "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
+
+<!-- lsaquo is proposed but not yet ISO standardized -->
+<!ENTITY lsaquo  "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
+<!-- rsaquo is proposed but not yet ISO standardized -->
+<!ENTITY rsaquo  "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
+<!ENTITY euro    "&#8364;" ><!-- euro sign, U+20AC NEW -->
+
+<!-- end of xhtml-special.ent -->
+
+F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
+
+<!-- ...................................................................... -->
+<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
+<!-- file: xhtml-symbol.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-symbol
+           PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+                  "xhtml-symbol.ent" >
+       %xhtml-symbol;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
+
+     Revision:  $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+     New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+     any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+     numbers are given for each character, in hex. Entity values are
+     decimal conversions of the ISO 10646 values and refer to the
+     document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- Latin Extended-B -->
+<!ENTITY fnof     "&#402;" ><!-- latin small f with hook = function
+                              = florin, U+0192 ISOtech -->
+
+<!-- Greek -->
+<!ENTITY Alpha    "&#913;" ><!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta     "&#914;" ><!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma    "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
+<!ENTITY Delta    "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
+<!ENTITY Epsilon  "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
+<!ENTITY Zeta     "&#918;" ><!-- greek capital letter zeta, U+0396 -->
+<!ENTITY Eta      "&#919;" ><!-- greek capital letter eta, U+0397 -->
+<!ENTITY Theta    "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
+<!ENTITY Iota     "&#921;" ><!-- greek capital letter iota, U+0399 -->
+<!ENTITY Kappa    "&#922;" ><!-- greek capital letter kappa, U+039A -->
+<!ENTITY Lambda   "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
+<!ENTITY Mu       "&#924;" ><!-- greek capital letter mu, U+039C -->
+<!ENTITY Nu       "&#925;" ><!-- greek capital letter nu, U+039D -->
+<!ENTITY Xi       "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
+<!ENTITY Omicron  "&#927;" ><!-- greek capital letter omicron, U+039F -->
+<!ENTITY Pi       "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
+<!ENTITY Rho      "&#929;" ><!-- greek capital letter rho, U+03A1 -->
+<!-- there is no Sigmaf, and no U+03A2 character either -->
+<!ENTITY Sigma    "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
+<!ENTITY Tau      "&#932;" ><!-- greek capital letter tau, U+03A4 -->
+<!ENTITY Upsilon  "&#933;" ><!-- greek capital letter upsilon,
+                              U+03A5 ISOgrk3 -->
+<!ENTITY Phi      "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
+<!ENTITY Chi      "&#935;" ><!-- greek capital letter chi, U+03A7 -->
+<!ENTITY Psi      "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
+<!ENTITY Omega    "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
+<!ENTITY alpha    "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
+<!ENTITY beta     "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
+<!ENTITY gamma    "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
+<!ENTITY delta    "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
+<!ENTITY epsilon  "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
+<!ENTITY zeta     "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
+<!ENTITY eta      "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
+<!ENTITY theta    "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
+<!ENTITY iota     "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
+<!ENTITY kappa    "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
+<!ENTITY lambda   "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
+<!ENTITY mu       "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
+<!ENTITY nu       "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
+<!ENTITY xi       "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
+<!ENTITY omicron  "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
+<!ENTITY pi       "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
+<!ENTITY rho      "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
+<!ENTITY sigmaf   "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
+<!ENTITY sigma    "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
+<!ENTITY tau      "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
+<!ENTITY upsilon  "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
+<!ENTITY phi      "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
+<!ENTITY chi      "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
+<!ENTITY psi      "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
+<!ENTITY omega    "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
+<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
+<!ENTITY upsih    "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
+<!ENTITY piv      "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
+
+<!-- General Punctuation -->
+<!ENTITY bull     "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub  -->
+<!-- bullet is NOT the same as bullet operator, U+2219 -->
+<!ENTITY hellip   "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub  -->
+<!ENTITY prime    "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
+<!ENTITY Prime    "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
+<!ENTITY oline    "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
+<!ENTITY frasl    "&#8260;" ><!-- fraction slash, U+2044 NEW -->
+
+<!-- Letterlike Symbols -->
+<!ENTITY weierp   "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
+<!ENTITY image    "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
+<!ENTITY real     "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
+<!ENTITY trade    "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
+<!ENTITY alefsym  "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
+<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
+     the same glyph could be used to depict both characters -->
+
+<!-- Arrows -->
+<!ENTITY larr     "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
+<!ENTITY uarr     "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
+<!ENTITY rarr     "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
+<!ENTITY darr     "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
+<!ENTITY harr     "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
+<!ENTITY crarr    "&#8629;" ><!-- downwards arrow with corner leftwards
+                               = carriage return, U+21B5 NEW -->
+<!ENTITY lArr     "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
+<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
+    but also does not have any other character for that function. So ? lArr can
+    be used for 'is implied by' as ISOtech suggests -->
+<!ENTITY uArr     "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
+<!ENTITY rArr     "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
+<!-- Unicode does not say this is the 'implies' character but does not have
+     another character with this function so ?
+     rArr can be used for 'implies' as ISOtech suggests -->
+<!ENTITY dArr     "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
+<!ENTITY hArr     "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
+
+<!-- Mathematical Operators -->
+<!ENTITY forall   "&#8704;" ><!-- for all, U+2200 ISOtech -->
+<!ENTITY part     "&#8706;" ><!-- partial differential, U+2202 ISOtech  -->
+<!ENTITY exist    "&#8707;" ><!-- there exists, U+2203 ISOtech -->
+<!ENTITY empty    "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
+<!ENTITY nabla    "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
+<!ENTITY isin     "&#8712;" ><!-- element of, U+2208 ISOtech -->
+<!ENTITY notin    "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
+<!ENTITY ni       "&#8715;" ><!-- contains as member, U+220B ISOtech -->
+<!-- should there be a more memorable name than 'ni'? -->
+<!ENTITY prod     "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
+<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
+     the same glyph might be used for both -->
+<!ENTITY sum      "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
+<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+     though the same glyph might be used for both -->
+<!ENTITY minus    "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
+<!ENTITY lowast   "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
+<!ENTITY radic    "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
+<!ENTITY prop     "&#8733;" ><!-- proportional to, U+221D ISOtech -->
+<!ENTITY infin    "&#8734;" ><!-- infinity, U+221E ISOtech -->
+<!ENTITY ang      "&#8736;" ><!-- angle, U+2220 ISOamso -->
+<!ENTITY and      "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
+<!ENTITY or       "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
+<!ENTITY cap      "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
+<!ENTITY cup      "&#8746;" ><!-- union = cup, U+222A ISOtech -->
+<!ENTITY int      "&#8747;" ><!-- integral, U+222B ISOtech -->
+<!ENTITY there4   "&#8756;" ><!-- therefore, U+2234 ISOtech -->
+<!ENTITY sim      "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
+<!-- tilde operator is NOT the same character as the tilde, U+007E,
+     although the same glyph might be used to represent both  -->
+<!ENTITY cong     "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
+<!ENTITY asymp    "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
+<!ENTITY ne       "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
+<!ENTITY equiv    "&#8801;" ><!-- identical to, U+2261 ISOtech -->
+<!ENTITY le       "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
+<!ENTITY ge       "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
+<!ENTITY sub      "&#8834;" ><!-- subset of, U+2282 ISOtech -->
+<!ENTITY sup      "&#8835;" ><!-- superset of, U+2283 ISOtech -->
+<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
+     font encoding and is not included. Should it be, for symmetry?
+     It is in ISOamsn  -->
+<!ENTITY nsub     "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
+<!ENTITY sube     "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
+<!ENTITY supe     "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
+<!ENTITY oplus    "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
+<!ENTITY otimes   "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
+<!ENTITY perp     "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
+<!ENTITY sdot     "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
+<!-- dot operator is NOT the same character as U+00B7 middle dot -->
+
+<!-- Miscellaneous Technical -->
+<!ENTITY lceil    "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc  -->
+<!ENTITY rceil    "&#8969;" ><!-- right ceiling, U+2309 ISOamsc  -->
+<!ENTITY lfloor   "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc  -->
+<!ENTITY rfloor   "&#8971;" ><!-- right floor, U+230B ISOamsc  -->
+<!ENTITY lang     "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
+<!-- lang is NOT the same character as U+003C 'less than'
+     or U+2039 'single left-pointing angle quotation mark' -->
+<!ENTITY rang     "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
+<!-- rang is NOT the same character as U+003E 'greater than'
+     or U+203A 'single right-pointing angle quotation mark' -->
+
+<!-- Geometric Shapes -->
+<!ENTITY loz      "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
+
+<!-- Miscellaneous Symbols -->
+<!ENTITY spades   "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
+<!-- black here seems to mean filled as opposed to hollow -->
+<!ENTITY clubs    "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
+<!ENTITY hearts   "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
+<!ENTITY diams    "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
+
+<!-- end of xhtml-symbol.ent -->
+"""
+  return text
+
+def get_apache_license():  # Returns the Apache License 2.0 header as a /** ... */ block comment followed by one blank line.
+  license = r"""/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"""
+  return license  # NOTE(review): this local shadows the `license` name that Python's site module installs; harmless inside this function.
+
+main()  # Called unconditionally at module load; there is no `if __name__ == "__main__":` guard around it.
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html
@ -17,6 +17,42 @@
 -->
 <html><head></head>
 <body>
-Filters that normalize text before tokenization.
+<p>
+  Chainable filters that normalize text before tokenization and provide
+  mappings between normalized text offsets and the corresponding offsets
+  in the original text.
+</p>
+<H2>CharFilter offset mappings</H2>
+<p>
+  CharFilters modify an input stream via a series of substring
+  replacements (including deletions and insertions) to produce an output
+  stream. There are three possible replacement cases: the replacement
+  string has the same length as the original substring; the replacement
+  is shorter; and the replacement is longer. In the latter two cases
+  (when the replacement has a different length than the original),
+  one or more offset correction mappings are required.
+</p>
+<p>
+  When the replacement is shorter than the original (e.g. when the
+  replacement is the empty string), a single offset correction mapping
+  should be added at the replacement's end offset in the output stream.
+  The <code>cumulativeDiff</code> parameter to the
+  <code>addOffCorrectMapping()</code> method will be the sum of all
+  previous replacement offset adjustments, with the addition of the
+  difference between the lengths of the original substring and the
+  replacement string (a positive value).
+</p>
+<p>
+  When the replacement is longer than the original (e.g. when the
+  original is the empty string), you should add as many offset
+  correction mappings as the difference between the lengths of the
+  replacement string and the original substring, starting at the
+  end offset the original substring would have had in the output stream.
+  The <code>cumulativeDiff</code> parameter to the
+  <code>addOffCorrectMapping()</code> method will be the sum of all
+  previous replacement offset adjustments, with the addition of the
+  difference between the lengths of the original substring and the
+  replacement string so far (a negative value).
+</p>
 </body>
 </html>
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
@ -23,6 +23,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;

@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.junit.Ignore;
+import org.apache.lucene.util._TestUtil;

 public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {

@ -41,8 +42,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
            "another <a href=\"http://lucene.apache.org/\">link</a>. " +
            "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
-    String gold = " this is some text  here is a  link  and " +
-            "another  link . " +
+    String gold = "\nthis is some text\n here is a link and " +
+            "another link. " +
            "This is an entity: & plus a <.  Here is an &. ";
    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
    StringBuilder builder = new StringBuilder();
@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
              + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
      position++;
    }
-    assertEquals(gold, builder.toString());
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
  }

  //Some sanity checks, but not a full-fledged check
@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    
  }

+  /**
+   * Runs the bundled "MS-Word 14 generated.htm" resource through
+   * {@link HTMLStripCharFilter} and checks that, once leading and trailing
+   * whitespace is trimmed, the filtered output is exactly "This is a test".
+   *
+   * @throws Exception if the resource cannot be read
+   */
+  public void testMSWord14GeneratedHTML() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
+    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    String gold = "This is a test";
+    StringBuilder builder = new StringBuilder();
+    try {
+      int ch;
+      while ((ch = reader.read()) != -1) {
+        builder.append((char) ch);
+      }
+    } finally {
+      // Close the filter (and the stream it wraps) even if reading fails,
+      // so the test does not leak the classpath resource.
+      reader.close();
+    }
+    // Trim once and reuse the value for both the message and the comparison.
+    String result = builder.toString().trim();
+    assertEquals("'" + result + "' is not equal to '" + gold + "'",
+                 gold, result);
+  }
+  
+  
  public void testGamma() throws Exception {
    String test = "&Gamma;";
    String gold = "\u0393";
@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
      builder.append((char)ch);
    }
    String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
  }

  public void testEntities() throws Exception {
@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
      builder.append((char)ch);
    }
    String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
  }

  public void testMoreEntities() throws Exception {
@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
      builder.append((char)ch);
    }
    String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
  }

  public void testReserved() throws Exception {
@ -147,8 +161,176 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }

  public void testMalformedHTML() throws Exception {
-    String test = "a <a hr<ef=aa<a>> </close</a>";
-    String gold = "a <a hr<ef=aa > </close ";
+    String[] testGold = {
+        "a <a hr<ef=aa<a>> </close</a>",
+        "a <a hr<ef=aa> </close",
+
+        "<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
+        "Submit a Site",
+
+        "<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
+        "Christian Science",
+
+        "<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
+        "\n",
+
+        // "<" before ">" inhibits tag recognition
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+        "<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
+
+        "<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
+        "",
+
+        "<link title=\"^\\\" 21Sta's Blog\" rel=\"search\"  type=\"application/opensearchdescription+xml\"  href=\"http://21sta.com/blog/inc/opensearch.php\" />",
+        "\n",
+
+        "<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
+        "?",
+
+        "<a href='/modern-furniture'   ' id='21txt' class='offtab'   onMouseout=\"this.className='offtab';  return true;\" onMouseover=\"this.className='ontab';  return true;\">",
+        "",
+
+        "<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
+        "",
+
+        "The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
+        "The <a href=medical\">http://www.advancedmd.com>medical practice software",
+
+        "<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
+        "Levi.com/BMX 2008 Clip of the Week 29...",
+
+        "<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
+        "Printer Friendly",
+
+        "<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
+        "Add to Favorites",
+
+        "<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
+        "At",
+
+        "E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
+        "E-mail: XXXXXX@example.com ",
+
+        "<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
+        "\nA'13?\n",
+
+        "<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
+        "\nHubert \"Geese\" Ausby\n",
+
+        "<href=\"http://anbportal.com/mms/login.asp\">",
+        "\n",
+
+        "<a href=\"",
+        "<a href=\"",
+
+        "<a href=\">",
+        "",
+
+        "<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
+        "#",
+
+        "<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
+        "",
+
+        "<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want  add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
+        "",
+
+        "<a href=#Services & Support>",
+        "",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' +  document.getElementById('advancedlink').style.display ;  document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
+        "",
+
+        "<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\"  hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
+        "",
+
+        "<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
+        "\n",
+
+        "<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
+        "#",
+
+        "<a href=  >",
+        "",
+
+        "<ahref=http:..",
+        "<ahref=http:..",
+
+        "<ahref=http:..>",
+        "\n",
+
+        "<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
+        "\nA",
+
+        "<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
+        "",
+
+        "<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
+        "",
+
+        "<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
+        "",
+
+        "<a class=\"at\" name=\"Lamborghini  href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
+        "Lamborghini /a>",
+
+        "<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
+        "",
+
+        "<a href=/myspace !style='color:#993333'>",
+        "",
+
+        "<meta name=3DProgId content=3DExcel.Sheet>",
+        "\n",
+
+        "<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
+        "\n",
+
+        "<td bgcolor=3D\"#FFFFFF\" nowrap>",
+        "\n",
+
+        "<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
+        "\"predicciones mundiales 2009\"",
+
+        "<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
+        "",
+
+        "<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
+        "Bishop\"",
+
+        "<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
+        "BHAA Eircom 2 & 5 miles CC combined start",
+
+        "<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
+        "",
+
+        "<a  href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
+        "",
+
+        // "<" before ">" inhibits tag recognition
+        "<input type=\"text\" value=\"<search here>\">",
+        "<input type=\"text\" value=\"\n\">",
+
+        "<input type=\"text\" value=\"<search here\">",
+        "<input type=\"text\" value=\"\n",
+
+        "<input type=\"text\" value=\"search here>\">",
+        "\">",
+
+        // "<" and ">" chars are accepted in on[Event] attribute values
+        "<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
+        "",
+
+        "<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
+        "\n\n\n",
+
+        "<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
+        "\n\n\n\n\n\n\n\n",
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      String test = testGold[i];
+      String gold = testGold[i + 1];
      Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
      StringBuilder builder = new StringBuilder();
      int ch = 0;
@ -156,36 +338,71 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
        builder.append((char)ch);
      }
      String result = builder.toString();
-    // System.out.println("Resu: " + result + "<EOL>");
-    // System.out.println("Gold: " + gold + "<EOL>");
-    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+      assertEquals("Test: '" + test + "'", gold, result);
+    }
  }

+
  public void testBufferOverflow() throws Exception {
+    // Each section below builds an input longer than HTMLStripCharFilter.getInitialBufferSize()
+    // (the "+ 500" sizing) around one kind of markup, padded with filler from appendChars().
-    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
    testBuilder.append("ah<?> ??????");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
    processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions

    testBuilder.setLength(0);
    testBuilder.append("<!--");//comments
-    appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+    appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads

    testBuilder.append("-->foo");
-    processBuffer(testBuilder.toString(), "Failed w/ comment");
+    // The whole oversized comment must be dropped; only the trailing "foo" is expected.
+    String gold = "foo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());

    testBuilder.setLength(0);
    testBuilder.append("<?");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
    testBuilder.append("?>");
-    processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+    // An oversized "<? ... ?>" processing instruction is expected to produce no output at all.
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
    
    testBuilder.setLength(0);
    testBuilder.append("<b ");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
    testBuilder.append("/>");
-    processBuffer(testBuilder.toString(), "Failed on tag");
-
+    // An oversized "<b ... />" tag is likewise expected to produce no output.
+    gold = "";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
+    ch = 0;
+    builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
  }

  private void appendChars(StringBuilder testBuilder, int numChars) {
@ -208,7 +425,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    } finally {
      // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
    }
-    assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+    assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
+        test, builder.toString());
  }

  public void testComment() throws Exception {
@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    } finally {
      // System.out.println("String: " + builder.toString());
    }
-    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
  }


@ -247,7 +466,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
  }

  public void testOffsets() throws Exception {
-    doTestOffsets("hello X how X are you");
+//    doTestOffsets("hello X how X are you");
    doTestOffsets("hello <p> X<p> how <p>X are you");
    doTestOffsets("X &amp; X &#40; X &lt; &gt; X");

@ -255,7 +474,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
  }

-  @Ignore("broken offsets: see LUCENE-2208")
+  // Streams the input through an HTMLStripCharFilter one char at a time and, for every
+  // output position read, asserts that correctOffset() does not exceed the input length.
+  static void assertLegalOffsets(String in) throws Exception {
+    final int inputLength = in.length();
+    HTMLStripCharFilter filter
+        = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    for (int outputOffset = 0 ; filter.read() != -1 ; ++outputOffset) {
+      int corrected = filter.correctOffset(outputOffset);
+      assertTrue("invalid offset correction: " + outputOffset + "->" + corrected + " for doc of length: " + inputLength,
+          corrected <= inputLength);
+    }
+  }
+
+  // Offset corrections must stay within the input, both for plain text and for text
+  // containing the incomplete hex character reference prefix "&#x".
+  public void testLegalOffsets() throws Exception {
+    final String[] inputs = { "hello world", "hello &#x world" };
+    for (String input : inputs) {
+      assertLegalOffsets(input);
+    }
+  }
+
  public void testRandom() throws Exception {
    Analyzer analyzer = new Analyzer() {

@ -274,4 +510,311 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
    int numRounds = RANDOM_MULTIPLIER * 10000;
    checkRandomData(random, analyzer, numRounds);
  }
+  
+  // Server-side include directives embedded in markup must be stripped with it:
+  //  1. "<!--#echo ...-->" directives inside the attribute values of an img tag ("onetwo");
+  //  2. a "<!--#config ...-->" directive inside a commented-out script body ("one\ntwo").
+  public void testServerSideIncludes() throws Exception {
+    String test = "one<img src=\"image.png\"\n"
+        + " alt =  \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}'  -->\"\n\n"
+        + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
+    String gold = "onetwo";
+    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    // assertEquals rather than assertTrue(x.equals(y)): a failure then reports expected
+    // vs. actual, and the check matches the second assertion in this method.
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+
+    test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
+    gold = "one\ntwo";
+    reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    ch = 0;
+    builder = new StringBuilder();
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
+
+  // Quoted strings inside a script body may contain comment delimiters and even a
+  // "</script>" end tag; the whole script element is still expected to collapse to "\n".
+  public void testScriptQuotes() throws Exception {
+    final String[] testGold = {
+        "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two",
+        "one\ntwo",
+
+        "hello<script><!-- f('<!--internal--></script>'); --></script>",
+        "hello\n",
+    };
+    for (int pair = 0 ; pair < testGold.length ; pair += 2) {
+      String gold = testGold[pair + 1];
+      Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(testGold[pair])));
+      StringBuilder stripped = new StringBuilder();
+      for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+        stripped.append((char)c);
+      }
+      String actual = stripped.toString();
+      assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
+    }
+  }
+
+  // With "SCRIPT" passed as an escaped tag, the script start and end tags are expected to
+  // be kept verbatim while the script body between them is still removed.
+  public void testEscapeScript() throws Exception {
+    final String input = "one<script no-value-attr>callSomeMethod();</script>two";
+    final String expected = "one<script no-value-attr></script>two";
+    Set<String> keepTags = new HashSet<String>(Arrays.asList("SCRIPT"));
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keepTags);
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // A style element whose body is wrapped in an HTML comment is expected to be replaced
+  // by a single "\n".
+  public void testStyle() throws Exception {
+    final String input = "one<style type=\"text/css\">\n"
+                       + "<!--\n"
+                       + "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+                       + "-->\n"
+                       + "</style>two";
+    final String expected = "one\ntwo";
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // With "STYLE" passed as an escaped tag, the style start and end tags are expected to be
+  // kept verbatim while the CSS between them is still removed.
+  public void testEscapeStyle() throws Exception {
+    final String input = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
+    final String expected = "one<style type=\"text/css\"></style>two";
+    Set<String> keepTags = new HashSet<String>(Arrays.asList("STYLE"));
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keepTags);
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // Every BR tag form used here (self-closing, bare, with junk attributes, and an end tag)
+  // is expected to come out as "\n".
+  public void testBR() throws Exception {
+    final String[] inputs = {
+        "one<BR />two<br>three",
+        "one<BR some stuff here too>two</BR>",
+    };
+    final String[] expected = {
+        "one\ntwo\nthree",
+        "one\ntwo\n",
+    };
+    for (int i = 0 ; i < inputs.length ; ++i) {
+      Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(inputs[i])));
+      StringBuilder stripped = new StringBuilder();
+      for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+        stripped.append((char)c);
+      }
+      assertEquals("Test: '" + inputs[i] + "'", expected[i], stripped.toString());
+    }
+  }
+  // With "BR" passed as an escaped tag, the input (including an end tag split across
+  // lines) is expected to pass through unchanged.
+  public void testEscapeBR() throws Exception {
+    final String input = "one<BR class='whatever'>two</\nBR\n>";
+    final String expected = "one<BR class='whatever'>two</\nBR\n>";
+    Set<String> keepTags = new HashSet<String>(Arrays.asList("BR"));
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keepTags);
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+  
+  // The mixed-case span tags and nested sup tags used here are expected to be removed
+  // without anything being inserted in their place.
+  public void testInlineTagsNoSpace() throws Exception {
+    final String input = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
+    final String expected = "onetwo2e.three";
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // CDATA section content is expected to be emitted literally (markup inside it is not
+  // stripped) with the CDATA delimiters themselves removed; the second case covers a
+  // nested-looking section split across two adjacent CDATA sections.
+  public void testCDATA() throws Exception {
+    final String[] testGold = {
+        "one<![CDATA[<one><two>three<four></four></two></one>]]>two",
+        "one<one><two>three<four></four></two></one>two",
+
+        "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
+        "onetwo<![CDATA[three]]>fourfive",
+    };
+    for (int pair = 0 ; pair < testGold.length ; pair += 2) {
+      String gold = testGold[pair + 1];
+      Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(testGold[pair])));
+      StringBuilder stripped = new StringBuilder();
+      for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+        stripped.append((char)c);
+      }
+      String actual = stripped.toString();
+      assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
+    }
+  }
+
+  // The all-uppercase entity names QUOT, COPY, GT, LT, REG and AMP are expected to be
+  // decoded to their characters; literal '>' and '<' in the input pass through.
+  public void testUppercaseCharacterEntityVariants() throws Exception {
+    final String input = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
+    final String expected = " \"-\u00A9>><<\u00AE&";
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+  
+  // A processing instruction that is closed with "/>" instead of "?>" is still expected
+  // to be stripped completely.
+  public void testMSWordMalformedProcessingInstruction() throws Exception {
+    final String input = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
+    final String expected = "onetwo";
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // Tags whose names contain CJK and supplementary (non-BMP) characters are expected to be
+  // recognized as tags: each one becomes "\n", and the same characters in text are kept.
+  public void testSupplementaryCharsInTags() throws Exception {
+    final String input = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
+    final String expected = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
+    StringBuilder stripped = new StringBuilder();
+    for (int c = filter.read() ; c != -1 ; c = filter.read()) {
+      stripped.append((char)c);
+    }
+    String actual = stripped.toString();
+    assertEquals("'" + actual + "' is not equal to '" + expected + "'", expected, actual);
+  }
+
+  // Smoke test: drains the filter over a randomly generated HTML-ish string. Nothing is
+  // asserted about the output; the test only fails if reading throws.
+  public void testRandomBrokenHTML() throws Exception {
+    final int maxNumElements = 10000;
+    String html = _TestUtil.randomHtmlishString(random, maxNumElements);
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
+    int c;
+    do {
+      c = filter.read();
+    } while (c != -1);
+  }
+
+  // Smoke test: drains the filter over space-separated random words drawn from one of
+  // three generators. Nothing is asserted about the output; the test only fails if
+  // reading throws.
+  public void testRandomText() throws Exception {
+    final int minNumWords = 10;
+    final int maxNumWords = 10000;
+    final int minWordLength = 3;
+    final int maxWordLength = 20;
+    // Keep this draw order (word count, then generator choice) so a given seed
+    // reproduces the same text.
+    final int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    final int generator = _TestUtil.nextInt(random, 0, 4);
+    StringBuilder text = new StringBuilder();
+    for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+      String word;
+      if (generator == 0) {
+        word = _TestUtil.randomUnicodeString(random, maxWordLength);
+      } else if (generator == 1) {
+        word = _TestUtil.randomRealisticUnicodeString(random, minWordLength, maxWordLength);
+      } else {
+        // ASCII 50% of the time
+        // NOTE(review): this holds only if nextInt's upper bound is exclusive; if it is
+        // inclusive this branch is taken for 3 of 5 values -- confirm against _TestUtil.
+        word = _TestUtil.randomSimpleString(random);
+      }
+      text.append(word);
+      text.append(' ');
+    }
+    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(text.toString())));
+    int c;
+    do {
+      c = filter.read();
+    } while (c != -1);
+  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word
@ -0,0 +1,653 @@
+<html xmlns:v="urn:schemas-microsoft-com:vml"
+      xmlns:o="urn:schemas-microsoft-com:office:office"
+      xmlns:w="urn:schemas-microsoft-com:office:word"
+      xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
+      xmlns="http://www.w3.org/TR/REC-html40">
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=ProgId content=Word.Document>
+<meta name=Generator content="Microsoft Word 14">
+<meta name=Originator content="Microsoft Word 14">
+<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
+<!--[if gte mso 9]><xml>
+  <o:DocumentProperties>
+    <o:Author>s</o:Author>
+    <o:LastAuthor>s</o:LastAuthor>
+    <o:Revision>1</o:Revision>
+    <o:TotalTime>1</o:TotalTime>
+    <o:Created>2012-01-13T03:36:00Z</o:Created>
+    <o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
+    <o:Pages>1</o:Pages>
+    <o:Words>8</o:Words>
+    <o:Characters>48</o:Characters>
+    <o:Lines>1</o:Lines>
+    <o:Paragraphs>1</o:Paragraphs>
+    <o:CharactersWithSpaces>55</o:CharactersWithSpaces>
+    <o:Version>14.00</o:Version>
+  </o:DocumentProperties>
+  <o:OfficeDocumentSettings>
+    <o:AllowPNG/>
+  </o:OfficeDocumentSettings>
+</xml><![endif]-->
+<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
+<link rel=colorSchemeMapping
+      href="This%20is%20a%20test_files/colorschememapping.xml">
+<!--[if gte mso 9]><xml>
+  <w:WordDocument>
+    <w:SpellingState>Clean</w:SpellingState>
+    <w:GrammarState>Clean</w:GrammarState>
+    <w:TrackMoves>false</w:TrackMoves>
+    <w:TrackFormatting/>
+    <w:PunctuationKerning/>
+    <w:ValidateAgainstSchemas/>
+    <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
+    <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
+    <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
+    <w:DoNotPromoteQF/>
+    <w:LidThemeOther>EN-US</w:LidThemeOther>
+    <w:LidThemeAsian>X-NONE</w:LidThemeAsian>
+    <w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
+    <w:Compatibility>
+      <w:BreakWrappedTables/>
+      <w:SnapToGridInCell/>
+      <w:WrapTextWithPunct/>
+      <w:UseAsianBreakRules/>
+      <w:DontGrowAutofit/>
+      <w:SplitPgBreakAndParaMark/>
+      <w:EnableOpenTypeKerning/>
+      <w:DontFlipMirrorIndents/>
+      <w:OverrideTableStyleHps/>
+    </w:Compatibility>
+    <m:mathPr>
+      <m:mathFont m:val="Cambria Math"/>
+      <m:brkBin m:val="before"/>
+      <m:brkBinSub m:val="&#45;-"/>
+      <m:smallFrac m:val="off"/>
+      <m:dispDef/>
+      <m:lMargin m:val="0"/>
+      <m:rMargin m:val="0"/>
+      <m:defJc m:val="centerGroup"/>
+      <m:wrapIndent m:val="1440"/>
+      <m:intLim m:val="subSup"/>
+      <m:naryLim m:val="undOvr"/>
+    </m:mathPr></w:WordDocument>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
+                DefSemiHidden="true" DefQFormat="false" DefPriority="99"
+                LatentStyleCount="267">
+<w:LsdException Locked="false" Priority="0" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
+<w:LsdException Locked="false" Priority="9" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
+<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
+<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
+<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
+<w:LsdException Locked="false" Priority="10" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Title"/>
+<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
+<w:LsdException Locked="false" Priority="11" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
+<w:LsdException Locked="false" Priority="22" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
+<w:LsdException Locked="false" Priority="20" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
+<w:LsdException Locked="false" Priority="59" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Table Grid"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
+<w:LsdException Locked="false" Priority="1" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 1"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
+<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
+<w:LsdException Locked="false" Priority="34" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
+<w:LsdException Locked="false" Priority="29" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
+<w:LsdException Locked="false" Priority="30" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 1"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 2"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 2"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 3"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 3"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 4"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 4"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 5"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 5"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
+<w:LsdException Locked="false" Priority="60" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="61" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light List Accent 6"/>
+<w:LsdException Locked="false" Priority="62" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="63" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="64" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="65" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="66" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="67" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
+<w:LsdException Locked="false" Priority="68" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
+<w:LsdException Locked="false" Priority="69" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
+<w:LsdException Locked="false" Priority="70" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Dark List Accent 6"/>
+<w:LsdException Locked="false" Priority="71" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
+<w:LsdException Locked="false" Priority="72" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
+<w:LsdException Locked="false" Priority="73" SemiHidden="false"
+                UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
+<w:LsdException Locked="false" Priority="19" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
+<w:LsdException Locked="false" Priority="21" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
+<w:LsdException Locked="false" Priority="31" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
+<w:LsdException Locked="false" Priority="32" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
+<w:LsdException Locked="false" Priority="33" SemiHidden="false"
+                UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
+<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
+<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
+</w:LatentStyles>
+</xml><![endif]-->
+<style>
+<!--
+  /* Font Definitions */
+@font-face
+{font-family:"Cambria Math";
+  panose-1:2 4 5 3 5 4 6 3 2 4;
+  mso-font-charset:1;
+  mso-generic-font-family:roman;
+  mso-font-format:other;
+  mso-font-pitch:variable;
+  mso-font-signature:0 0 0 0 0 0;}
+@font-face
+{font-family:Cambria;
+  panose-1:2 4 5 3 5 4 6 3 2 4;
+  mso-font-charset:0;
+  mso-generic-font-family:roman;
+  mso-font-pitch:variable;
+  mso-font-signature:-536870145 1073743103 0 0 415 0;}
+@font-face
+{font-family:Calibri;
+  panose-1:2 15 5 2 2 2 4 3 2 4;
+  mso-font-charset:0;
+  mso-generic-font-family:swiss;
+  mso-font-pitch:variable;
+  mso-font-signature:-520092929 1073786111 9 0 415 0;}
+  /* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+{mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-parent:"";
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:10.0pt;
+  margin-left:0in;
+  line-height:115%;
+  mso-pagination:widow-orphan;
+  font-size:11.0pt;
+  font-family:"Calibri","sans-serif";
+  mso-ascii-font-family:Calibri;
+  mso-ascii-theme-font:minor-latin;
+  mso-fareast-font-family:Calibri;
+  mso-fareast-theme-font:minor-latin;
+  mso-hansi-font-family:Calibri;
+  mso-hansi-theme-font:minor-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:minor-bidi;}
+h1
+{mso-style-priority:9;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Heading 1 Char";
+  mso-style-next:Normal;
+  margin-top:24.0pt;
+  margin-right:0in;
+  margin-bottom:0in;
+  margin-left:0in;
+  margin-bottom:.0001pt;
+  line-height:115%;
+  mso-pagination:widow-orphan lines-together;
+  page-break-after:avoid;
+  mso-outline-level:1;
+  font-size:14.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#365F91;
+  mso-themecolor:accent1;
+  mso-themeshade:191;
+  mso-font-kerning:0pt;}
+p.MsoTitle, li.MsoTitle, div.MsoTitle
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:15.0pt;
+  margin-left:0in;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin:0in;
+  margin-bottom:.0001pt;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin:0in;
+  margin-bottom:.0001pt;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
+{mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-qformat:yes;
+  mso-style-link:"Title Char";
+  mso-style-next:Normal;
+  mso-style-type:export-only;
+  margin-top:0in;
+  margin-right:0in;
+  margin-bottom:15.0pt;
+  margin-left:0in;
+  mso-add-space:auto;
+  mso-pagination:widow-orphan;
+  border:none;
+  mso-border-bottom-alt:solid #4F81BD 1.0pt;
+  mso-border-bottom-themecolor:accent1;
+  padding:0in;
+  mso-padding-alt:0in 0in 4.0pt 0in;
+  font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+span.TitleChar
+{mso-style-name:"Title Char";
+  mso-style-priority:10;
+  mso-style-unhide:no;
+  mso-style-locked:yes;
+  mso-style-link:Title;
+  mso-ansi-font-size:26.0pt;
+  mso-bidi-font-size:26.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#17365D;
+  mso-themecolor:text2;
+  mso-themeshade:191;
+  letter-spacing:.25pt;
+  mso-font-kerning:14.0pt;}
+span.Heading1Char
+{mso-style-name:"Heading 1 Char";
+  mso-style-priority:9;
+  mso-style-unhide:no;
+  mso-style-locked:yes;
+  mso-style-link:"Heading 1";
+  mso-ansi-font-size:14.0pt;
+  mso-bidi-font-size:14.0pt;
+  font-family:"Cambria","serif";
+  mso-ascii-font-family:Cambria;
+  mso-ascii-theme-font:major-latin;
+  mso-fareast-font-family:"Times New Roman";
+  mso-fareast-theme-font:major-fareast;
+  mso-hansi-font-family:Cambria;
+  mso-hansi-theme-font:major-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:major-bidi;
+  color:#365F91;
+  mso-themecolor:accent1;
+  mso-themeshade:191;
+  font-weight:bold;}
+.MsoChpDefault
+{mso-style-type:export-only;
+  mso-default-props:yes;
+  font-family:"Calibri","sans-serif";
+  mso-ascii-font-family:Calibri;
+  mso-ascii-theme-font:minor-latin;
+  mso-fareast-font-family:Calibri;
+  mso-fareast-theme-font:minor-latin;
+  mso-hansi-font-family:Calibri;
+  mso-hansi-theme-font:minor-latin;
+  mso-bidi-font-family:"Times New Roman";
+  mso-bidi-theme-font:minor-bidi;}
+.MsoPapDefault
+{mso-style-type:export-only;
+  margin-bottom:10.0pt;
+  line-height:115%;}
+@page WordSection1
+{size:8.5in 11.0in;
+  margin:1.0in 1.0in 1.0in 1.0in;
+  mso-header-margin:.5in;
+  mso-footer-margin:.5in;
+  mso-paper-source:0;}
+div.WordSection1
+{page:WordSection1;}
+-->
+</style>
+<!--[if gte mso 10]>
+<style>
+    /* Style Definitions */
+  table.MsoNormalTable
+  {mso-style-name:"Table Normal";
+    mso-tstyle-rowband-size:0;
+    mso-tstyle-colband-size:0;
+    mso-style-noshow:yes;
+    mso-style-priority:99;
+    mso-style-parent:"";
+    mso-padding-alt:0in 5.4pt 0in 5.4pt;
+    mso-para-margin-top:0in;
+    mso-para-margin-right:0in;
+    mso-para-margin-bottom:10.0pt;
+    mso-para-margin-left:0in;
+    line-height:115%;
+    mso-pagination:widow-orphan;
+    font-size:11.0pt;
+    font-family:"Calibri","sans-serif";
+    mso-ascii-font-family:Calibri;
+    mso-ascii-theme-font:minor-latin;
+    mso-hansi-font-family:Calibri;
+    mso-hansi-theme-font:minor-latin;
+    mso-bidi-font-family:"Times New Roman";
+    mso-bidi-theme-font:minor-bidi;}
+</style>
+<![endif]--><!--[if gte mso 9]><xml>
+  <o:shapedefaults v:ext="edit" spidmax="1026"/>
+</xml><![endif]--><!--[if gte mso 9]><xml>
+  <o:shapelayout v:ext="edit">
+    <o:idmap v:ext="edit" data="1"/>
+  </o:shapelayout></xml><![endif]-->
+</head>
+
+<body lang=EN-US style='tab-interval:.5in'>
+
+<div class=WordSection1>
+
+  <div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
+mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
+
+    <p class=MsoTitle>This is a test</p>
+
+  </div>
+
+</div>
+
+</body>
+
+</html>
+
--- a/modules/analysis/icu/build.xml
+++ b/modules/analysis/icu/build.xml
@ -113,6 +113,23 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
    </java>
  </target>

+  <!-- Destination of the generated supplementary-character JFlex macro file;
+       it lives in the charfilter package directory of the sibling ../common
+       module. -->
+  <property name="html.strip.charfilter.supp.macros.output.file"
+            location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
+
+  <!-- Runs GenerateHTMLStripCharFilterSupplementaryMacros from the compiled
+       tools classes in a forked JVM and redirects the program's output into
+       the file named by the property above. Requires compile-tools to have
+       run first; failonerror aborts the build if the tool fails. -->
+  <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
+    <java
+        classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
+        dir="."
+        fork="true"
+        failonerror="true"
+        output="${html.strip.charfilter.supp.macros.output.file}">
+      <classpath>
+        <path refid="additional.dependencies"/>
+        <pathelement location="${build.dir}/classes/tools"/>
+      </classpath>
+    </java>
+  </target>
+
  <target name="compile-tools" depends="common.compile-tools">
    <compile
      srcdir="src/tools/java"
--- a/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
+++ b/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.DateFormat;
+import java.util.*;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.text.UnicodeSetIterator;
+import com.ibm.icu.util.VersionInfo;
+
+/** Generates JFlex macros that augment JFlex's Unicode support with supplementary (above-BMP) characters. */
+public class GenerateHTMLStripCharFilterSupplementaryMacros {
+  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
+  private static final String NL = System.getProperty("line.separator");
+  private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
+      (DateFormat.FULL, DateFormat.FULL, Locale.US);
+  static {
+    DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
+  }
+
+  private static final String APACHE_LICENSE
+      = "/*" + NL
+      + " * Copyright 2010 The Apache Software Foundation." + NL
+      + " *" + NL
+      + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+      + " * you may not use this file except in compliance with the License." + NL
+      + " * You may obtain a copy of the License at" + NL
+      + " *" + NL
+      + " *      http://www.apache.org/licenses/LICENSE-2.0" + NL
+      + " *" + NL
+      + " * Unless required by applicable law or agreed to in writing, software" + NL
+      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+      + " * See the License for the specific language governing permissions and" + NL
+      + " * limitations under the License." + NL
+      + " */" + NL + NL;
+
+
+  public static void main(String args[]) throws Exception {
+    outputHeader();
+    outputMacro("ID_Start_Supp", "[:ID_Start:]");
+    outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
+  }
+
+  static void outputHeader() {
+    System.out.print(APACHE_LICENSE);
+    System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
+    System.out.println(DATE_FORMAT.format(new Date()));
+    System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
+    System.out.print(NL + NL);
+  }
+
+  // we have to carefully output the possibilities as compact utf-16
+  // range expressions, or jflex will OOM!
+  static void outputMacro(String name, String pattern) {
+    UnicodeSet set = new UnicodeSet(pattern);
+    set.removeAll(BMP);
+    System.out.println(name + " = (");
+    // if the set is empty, we have to do this or jflex will barf
+    if (set.isEmpty()) {
+      System.out.println("\t  []");
+    }
+
+    HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
+    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
+      char utf16[] = Character.toChars(it.codepoint);
+      UnicodeSet trails = utf16ByLead.get(utf16[0]);
+      if (trails == null) {
+        trails = new UnicodeSet();
+        utf16ByLead.put(utf16[0], trails);
+      }
+      trails.add(utf16[1]);
+    }
+    
+    Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
+    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
+      String trail = entry.getValue().getRegexEquivalent();
+      UnicodeSet leads = utf16ByTrail.get(trail);
+      if (leads == null) {
+        leads = new UnicodeSet();
+        utf16ByTrail.put(trail, leads);
+      }
+      leads.add(entry.getKey());
+    }
+
+    boolean isFirst = true;
+    for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
+      System.out.print( isFirst ? "\t  " : "\t| ");
+      isFirst = false;
+      System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
+    }
+    System.out.println(")");
+  }
+}
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -401,6 +401,14 @@ Upgrading from Solr 3.5
 * As doGet() methods in SimplePostTool was changed to static, the client applications of this
  class need to be recompiled.

+* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
+  character offsets it provided, triggering e.g. exceptions in highlighting.
+  HTMLStripCharFilter has been re-implemented, addressing this and other
+  issues.  See the entry for LUCENE-3690 in the Bug Fixes section below for a
+  detailed list of changes.  For people who depend on the behavior of
+  HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
+  (bugs and all) is preserved as LegacyHTMLStripCharFilter.
+
 New Features
 ----------------------
 * SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@ -483,6 +491,41 @@ Bug Fixes

 * SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)

+* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
+  HTMLStripCharFilter as a JFlex-generated scanner.  See below for a list
+  of bug fixes and other changes.  To get the same behavior as
+  HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
+  use LegacyHTMLStripCharFilter, which is the previous implementation.
+
+  Behavior changes from the previous version:
+
+  - Known offset bugs are fixed.
+  - The "Mark invalid" exceptions reported in SOLR-1283 are no longer
+    triggered (the bug is still present in LegacyHTMLStripCharFilter).
+  - The character entity "&apos;" is now always properly decoded.
+  - More cases of <script> tags are now properly stripped.
+  - CDATA sections are now handled properly.
+  - Valid tag name characters now include the supplementary Unicode characters
+    from Unicode character classes [:ID_Start:] and [:ID_Continue:].
+  - Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
+    and "&AMP;" are now recognized and handled as if they were in lowercase.
+  - Opening tags with unbalanced quotation marks are now properly stripped.
+  - Literal "<" and ">" characters in opening tags, regardless of whether they
+    appear inside quotation marks, now inhibit recognition (and stripping) of
+    the tags.  The only exception to this is for values of event-handler
+    attributes, e.g. "onClick", "onLoad", "onSelect".
+  - A newline '\n' is substituted instead of a space for stripped HTML markup.
+  - Nothing is substituted for opening and closing inline tags - they are
+    simply removed.  The list of inline tags is (case insensitively): <a>,
+    <abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
+    <em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
+    <select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
+    <tt>, <u>, and <var>.
+  - HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
+    feature: opening and closing tags with the given names, including any
+    attributes and their values, are left intact in the output.
+  (Steve Rowe)
+
 Other Changes
 ----------------------
 * SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
--- a/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
@ -21,12 +21,18 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 /**
 * Factory for {@link HTMLStripCharFilter}. 
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
- *     &lt;charFilter class="solr.HTMLStripCharFilterFactory"/&gt;
+ *     &lt;charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre
@ -34,8 +40,31 @@ import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 */
 public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
  
+  Set<String> escapedTags = null;
+  Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
+
  public HTMLStripCharFilter create(CharStream input) {
-    return new HTMLStripCharFilter(input);
+    HTMLStripCharFilter charFilter;
+    if (null == escapedTags) {
+      charFilter = new HTMLStripCharFilter(input);
+    } else {
+      charFilter = new HTMLStripCharFilter(input, escapedTags);
+    }
+    return charFilter;
  }
  
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String escapedTagsArg = args.get("escapedTags");
+    if (null != escapedTagsArg) {
+      Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
+      while (matcher.find()) {
+        if (null == escapedTags) {
+          escapedTags = new HashSet<String>();
+        }
+        escapedTags.add(matcher.group(0));
+      }
+    }
+  }
 }
--- a/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
+++ b/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
--- a/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
@ -0,0 +1,58 @@
+package org.apache.solr.analysis;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * Factory for {@link LegacyHTMLStripCharFilter}.
+ * <pre class="prettyprint" >
+ * &lt;fieldType name="text_html_legacy" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;charFilter class="solr.LegacyHTMLStripCharFilterFactory"/&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ * <p>
+ * This factory is <b>NOT</b> recommended for new users and should be
+ * considered <b>UNSUPPORTED</b>.
+ * </p>
+ * <p>
+ * In Solr version 3.5 and earlier, <tt>HTMLStripCharFilter(Factory)</tt>
+ * had known bugs in the offsets it provided, triggering e.g. exceptions in
+ * highlighting.
+ * </p>
+ * <p>
+ * This class is provided as a possible alternative for people who depend on
+ * the "broken" behavior of <tt>HTMLStripCharFilter</tt> in Solr version 3.5
+ * and earlier, and/or who don't like the changes introduced by the Solr 3.6+
+ * version of <tt>HTMLStripCharFilterFactory</tt>.  (See the 3.6.0 release
+ * section of solr/CHANGES.txt for a list of differences in behavior.)
+ * </p>
+ * @deprecated use {@link HTMLStripCharFilterFactory}
+ */
+@Deprecated
+public class LegacyHTMLStripCharFilterFactory extends BaseCharFilterFactory {
+
+  public LegacyHTMLStripCharFilter create(CharStream input) {
+    return new LegacyHTMLStripCharFilter(input);
+  }
+
+}
--- a/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
+++ b/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
@ -0,0 +1,321 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
+
+public class LegacyHTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+
+  //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
+  //
+  public void test() throws IOException {
+    String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
+            "another <a href=\"http://lucene.apache.org/\">link</a>. " +
+            "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
+    String gold = " this is some text  here is a  link  and " +
+            "another  link . " +
+            "This is an entity: & plus a <.  Here is an &.  ";
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(html)));
+    StringBuilder builder = new StringBuilder();
+    int ch = -1;
+    char [] goldArray = gold.toCharArray();
+    int position = 0;
+    while ((ch = reader.read()) != -1){
+      char theChar = (char) ch;
+      builder.append(theChar);
+      assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
+              + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
+      position++;
+    }
+    assertEquals(gold, builder.toString());
+  }
+
+  // Sanity checks on a real HTML page (htmlStripReaderTest.html); not a full-fledged verification of the output
+  public void testHTML() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
+    StringBuilder builder = new StringBuilder();
+    int ch = -1;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String str = builder.toString();
+    assertTrue("Entity not properly escaped", str.indexOf("&lt;") == -1);//there is one > in the text
+    assertTrue("Forrest should have been stripped out", str.indexOf("forrest") == -1 && str.indexOf("Forrest") == -1);
+    assertTrue("File should start with 'Welcome to Solr' after trimming", str.trim().startsWith("Welcome to Solr"));
+
+    assertTrue("File should end with 'Foundation.' after trimming", str.trim().endsWith("Foundation."));
+    
+  }
+
+  public void testGamma() throws Exception {
+    String test = "&Gamma;";
+    String gold = "\u0393";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    // System.out.println("Resu: " + result + "<EOL>");
+    // System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+  }
+
+  public void testEntities() throws Exception {
+    String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
+    String gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    // System.out.println("Resu: " + result + "<EOL>");
+    // System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+  }
+
+  public void testMoreEntities() throws Exception {
+    String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
+    String gold = "  <junk/>   ! @ and ’";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    // System.out.println("Resu: " + result + "<EOL>");
+    // System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
+  }
+
+  public void testReserved() throws Exception {
+    String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
+    Set<String> set = new HashSet<String>();
+    set.add("reserved");
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    // System.out.println("Result: " + result);
+    assertTrue("Escaped tag not preserved: "  + result.indexOf("reserved"), result.indexOf("reserved") == 9);
+    assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 15), result.indexOf("reserved", 15) == 38);
+    assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 41), result.indexOf("reserved", 41) == 54);
+    assertTrue("Other tag should be removed", result.indexOf("other") == -1);
+  }
+
+  public void testMalformedHTML() throws Exception {
+    String test = "a <a hr<ef=aa<a>> </close</a>";
+    String gold = "a <a hr<ef=aa > </close ";
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
+    }
+    String result = builder.toString();
+    // System.out.println("Resu: " + result + "<EOL>");
+    // System.out.println("Gold: " + gold + "<EOL>");
+    assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
+  }
+
+  public void testBufferOverflow() throws Exception {
+    StringBuilder testBuilder = new StringBuilder(LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+    testBuilder.append("ah<?> ??????");
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
+
+    testBuilder.setLength(0);
+    testBuilder.append("<!--");//comments
+    appendChars(testBuilder, 3*LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+
+    testBuilder.append("-->foo");
+    processBuffer(testBuilder.toString(), "Failed w/ comment");
+
+    testBuilder.setLength(0);
+    testBuilder.append("<?");
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    testBuilder.append("?>");
+    processBuffer(testBuilder.toString(), "Failed with proc. instr.");
+    
+    testBuilder.setLength(0);
+    testBuilder.append("<b ");
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    testBuilder.append("/>");
+    processBuffer(testBuilder.toString(), "Failed on tag");
+
+  }
+
+  private void appendChars(StringBuilder testBuilder, int numChars) {
+    int i1 = numChars / 2;
+    for (int i = 0; i < i1; i++){
+      testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes LegacyHTMLStripCharFilter think it is a processing instruction
+    }
+  }  
+
+
+  private void processBuffer(String test, String assertMsg) throws IOException {
+    // System.out.println("-------------------processBuffer----------");
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
+    }
+    assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
+  }
+
+  public void testComment() throws Exception {
+
+    String test = "<!--- three dashes, still a valid comment ---> ";
+    String gold = "  ";
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1){
+        builder.append((char)ch);
+      }
+    } finally {
+      // System.out.println("String: " + builder.toString());
+    }
+    assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
+  }
+
+
+  public void doTestOffsets(String in) throws Exception {
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    int ch = 0;
+    int off = 0;     // offset in the reader
+    int strOff = -1; // offset in the original string
+    while ((ch = reader.read()) != -1) {
+      int correctedOff = reader.correctOffset(off);
+
+      if (ch == 'X') {
+        strOff = in.indexOf('X',strOff+1);
+        assertEquals(strOff, correctedOff);
+      }
+
+      off++;
+    }
+  }
+
+  public void testOffsets() throws Exception {
+    doTestOffsets("hello X how X are you");
+    doTestOffsets("hello <p> X<p> how <p>X are you");
+    doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
+
+    // test backtracking
+    doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
+  }
+  
+  @Ignore("broken offsets: see LUCENE-2208")
+  public void testRandom() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+
+      @Override
+      protected Reader initReader(Reader reader) {
+        return new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+      }
+    };
+    
+    int numRounds = RANDOM_MULTIPLIER * 10000;
+    checkRandomData(random, analyzer, numRounds);
+  }
+
+  public void testRandomBrokenHTML() throws Exception {
+    int maxNumElements = 10000;
+    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
+    Reader reader
+        = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(text)));
+    while (reader.read() != -1);
+  }
+
+  public void testRandomText() throws Exception {
+    StringBuilder text = new StringBuilder();
+    int minNumWords = 10;
+    int maxNumWords = 10000;
+    int minWordLength = 3;
+    int maxWordLength = 20;
+    int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    switch (_TestUtil.nextInt(random, 0, 4)) {
+      case 0: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      case 1: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomRealisticUnicodeString
+              (random, minWordLength, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      default: { // ASCII 50% of the time
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomSimpleString(random));
+          text.append(' ');
+        }
+      }
+    }
+    Reader reader = new LegacyHTMLStripCharFilter
+        (CharReader.get(new StringReader(text.toString())));
+    while (reader.read() != -1);
+  }
+}
--- a/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestHTMLStripCharFilterFactory extends BaseTokenTestCase {
+
+
+  public void testNothingChanged() throws IOException {
+    //                             11111111112
+    //                   012345678901234567890
+    final String text = "this is only a test.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "a, Title");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] { 0, 5,  8, 13, 15 },
+        new int[] { 4, 7, 12, 14, 20 });
+  }
+
+  public void testNoEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "U i");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "<u>this</u>", "is", "only", "a", "<I>test</I>." },
+        new int[] {  0, 12, 18, 27, 29 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testSeparatorOnlyEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", ",, , ");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testEmptyEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testSingleEscapedTag() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", ", B\r\n\t");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "<b>only</b>", "a", "test." },
+        new int[] {  3, 12, 15, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+}
--- a/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
+++ b/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
@ -0,0 +1,350 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Welcome to Solr</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">apache</a> &gt; <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr_small.png" title="Solr Description"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">About</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menupage">
+<div class="menupagetitle">Welcome</div>
+</div>
+<div class="menuitem">
+<a href="who.html" title="Solr Committers">Who We Are</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="features.html">Features</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="api/index.html">javadoc</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Solr</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#intro">What Is Solr?</a>
+</li>
+<li>
+<a href="#news">News</a>
+<ul class="minitoc">
+<li>
+<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at OSSummit Asia</a>
+</li>
+<li>
+<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
+</li>
+<li>
+<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
+</li>
+<li>
+<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
+</li>
+<li>
+<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
+</li>
+<li>
+<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
+</li>
+<li>
+<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
+</li>
+<li>
+<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
+</li>
+<li>
+<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="intro"></a>
+<h2 class="boxed">What Is Solr?</h2>
+<div class="section">
+<p>
+        Solr is an open source enterprise search server based on the
+        <a href="http://lucene.apache.org/java/">Lucene Java</a> search library, with XML/HTTP and JSON APIs,
+        hit highlighting, faceted search, caching, replication, and a web administration interface.
+        It runs in a Java servlet container such as <a href="http://tomcat.apache.org">Tomcat</a>.
+      </p>
+<p>
+        See the complete <a href="features.html">feature list</a> for more details, then check out the <a href="tutorial.html">tutorial</a>.
+      </p>
+</div>
+
+    
+<a name="N1002A"></a><a name="news"></a>
+<h2 class="boxed">News</h2>
+<div class="section">
+<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
+<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
+<p>
+<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
+          Lucene and Solr tutorials!
+        </p>
+<p>The following talks and trainings are scheduled for the upcoming 2008 OSSummit:</p>
+<ul>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by Erik Hatcher (originally by Grant Ingersoll).  An all-day training focusing on getting started with Lucene - the core library under Solr.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by Erik Hatcher.  All you need to know to use Solr effectively.</li>
+            
+<li>
+<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher.  A rapid series of examples of many Lucene and Solr using applications.</li>
+          
+</ul>
+<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
+<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
+<p>
+<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
+              Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007.  
+            </p>
+<p>The following talks and trainings are scheduled for this year's conference:</p>
+<ul>
+                
+<li>November 12: <a href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by Grant Ingersoll.  An all-day training focusing on getting started with Lucene.</li>
+                
+<li>November 16, 9:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the Box</a> by Chris Hostetter. Introduction to Solr.</li>
+                
+<li>November 16, 10:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical Search Site using Apache Software</a> by Ken Krugler. Will cover many Lucene-based projects.</li>
+                
+<li>November 16, 3:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene performance.</li>
+                
+<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
+              
+</ul>
+<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
+<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
+<p>
+        This is the first release since Solr graduated from the Incubator,
+        bringing many new features, including CSV/delimited-text data
+        loading, time based autocommit, faster faceting, negative filters,
+        a spell-check handler, sounds-like word filters, regex text filters,
+        and more flexible plugins.
+      </p>
+<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
+<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
+<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
+<p>
+        Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
+      </p>
+<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
+<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
+<p>
+        This is the first release since Solr joined the Incubator, and brings
+        many new features and performance optimizations including highlighting,
+        faceted search, and JSON/Python/Ruby response formats.
+      </p>
+<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
+<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
+<p>Chris Hostetter will be presenting
+        <strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>  
+        at ApacheCon US 2006, on October 13th at 4:30pm.
+        See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
+      </p>
+<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
+<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
+<p>Yonik Seeley will be presenting
+        <strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>  
+        at ApacheCon Europe 2006, on June 29th at 5:30pm.
+        See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
+      </p>
+<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
+<h3 class="boxed">21 February 2006: nightly builds</h3>
+<p>Solr now has nightly builds.  This automatically creates a
+      <a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
+      night</a>.  All unit tests must pass, or a message is sent to
+      the developers mailing list and no new version is created.  This
+      also updates the <a href="api/index.html">javadoc</a>.</p>
+<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
+<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
+<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
+            Solr was originally developed by CNET Networks, and is widely used within CNET
+            to provide high relevancy search and faceted browsing capabilities.
+            </p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>
--- a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@ -326,8 +326,8 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
    NamedList indexPart = textType.get("index");
    assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
    
-    assertEquals("  whátëvêr  ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
-    assertEquals("  whatever  ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
+    assertEquals("\n\nwhátëvêr\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
+    assertEquals("\n\nwhatever\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));

    List<NamedList> tokenList = (List<NamedList>)indexPart.get(MockTokenizer.class.getName());
    assertNotNull("Expecting MockTokenizer analysis breakdown", tokenList);