merge trunk (1233476:1235908)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3661@1235919 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-25 20:32:44 +00:00
commit 58e5ec6979
282 changed files with 55892 additions and 4375 deletions


@ -100,7 +100,7 @@
<classpathentry kind="lib" path="modules/benchmark/lib/commons-digester-1.7.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
<classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1209632.jar"/>
<classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1211150.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-csv-1.0-SNAPSHOT-r966014.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-fileupload-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/lib/commons-httpclient-3.1.jar"/>
@ -115,7 +115,7 @@
<classpathentry kind="lib" path="solr/lib/slf4j-api-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/lib/slf4j-jdk14-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/lib/wstx-asl-3.2.7.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.3.jar"/>
<classpathentry kind="lib" path="solr/lib/zookeeper-3.3.4.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="solr/example/lib/jetty-util-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="solr/example/lib/servlet-api-2.5-20081211.jar"/>
@ -136,7 +136,7 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcmail-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcprov-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/boilerpipe-1.1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.2.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.3.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/dom4j-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/fontbox-1.6.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/jempbox-1.6.0.jar"/>
@ -149,8 +149,8 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/poi-scratchpad-3.8-beta4.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/rome-0.9.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tagsoup-1.2.1.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.10.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.10.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/langdetect-r111.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/jsonic-1.2.0.jar"/>


@ -45,7 +45,7 @@
<jetty.version>6.1.26</jetty.version>
<patched.jetty.version>6.1.26-patched-JETTY-1340</patched.jetty.version>
<slf4j.version>1.6.1</slf4j.version>
<tika.version>0.10</tika.version>
<tika.version>1.0</tika.version>
</properties>
<issueManagement>
<system>JIRA</system>
@ -283,7 +283,7 @@
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.3.3</version>
<version>3.3.4</version>
</dependency>
<dependency>
<groupId>org.carrot2</groupId>
@ -362,6 +362,19 @@
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<!-- Maven 2.2.X has a bug that omits as duplicate all JUnit -->
<!-- dependencies from the classpath when compiling solr-clustering, -->
<!-- causing test compilation to fail. Maven 3.0.4 test compilation -->
<!-- succeeds with the exact same dependencies, so apparently the -->
<!-- bug has been fixed. This dependency can be removed when the -->
<!-- minimum Maven version is upgraded to 3.0.4+. -->
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<directory>lucene/build/lucene-parent</directory>
<pluginManagement>
@ -385,6 +398,11 @@
<target>${java.compat.version}</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.4</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>
@ -652,7 +670,7 @@
<artifactId>solr-noggit</artifactId>
<version>${project.version}</version>
<packaging>jar</packaging>
<file>solr/lib/apache-solr-noggit-r1209632.jar</file>
<file>solr/lib/apache-solr-noggit-r1211150.jar</file>
</configuration>
</execution>
<execution>


@ -202,6 +202,12 @@
<testResource>
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>${project.build.testSourceDirectory}</directory>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</testResource>
<testResource>
<directory>../solrj/src/test-files</directory>
</testResource>


@ -113,6 +113,13 @@
<skip>true</skip> <!-- Tests are run from solr-core module -->
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<skip>true</skip> <!-- This skips test compilation - tests are run from solr-core module -->
</configuration>
</plugin>
</plugins>
</build>
</project>


@ -742,6 +742,9 @@ Changes in backwards compatibility policy
behavior. Added seekExact() to FSTEnum, and added FST.save/read
from a File. (Mike McCandless, Dawid Weiss, Robert Muir)
* LUCENE-3712: Removed unused and untested ReaderUtil#subReader methods.
(Uwe Schindler)
Security fixes
* LUCENE-3588: Try harder to prevent SIGSEGV on cloned MMapIndexInputs:
@ -790,6 +793,12 @@ New Features
input mapping to it) for FSTs that have strictly monotonic long
outputs (such as an ord). (Mike McCandless)
* LUCENE-3671: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe)
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
@ -808,9 +817,11 @@ Bug fixes
* LUCENE-3641: Fixed MultiReader to correctly propagate readerFinishedListeners
to clones/reopened readers. (Uwe Schindler)
* LUCENE-3642: Fixed bugs in CharTokenizer, n-gram filters, and smart chinese
where they would create invalid offsets in some situations, leading to problems
in highlighting. (Max Beutel via Robert Muir)
* LUCENE-3642, SOLR-2891, LUCENE-3717: Fixed bugs in CharTokenizer, n-gram tokenizers/filters,
compound token filters, ThaiWordFilter, ICUTokenizer, PatternAnalyzer,
WikipediaTokenizer, and the smart chinese analyzer where they would create invalid offsets in
some situations, leading to problems in highlighting.
(Max Beutel, Edwin Steiner via Robert Muir)
* LUCENE-3639: TopDocs.merge was incorrectly setting TopDocs.maxScore to
Float.MIN_VALUE when it should be Float.NaN, when there were 0
@ -825,6 +836,12 @@ Bug fixes
* LUCENE-3605: don't sleep in a retry loop when trying to locate the
segments_N file (Robert Muir, Mike McCandless)
* LUCENE-3711: SentinelIntSet with a small initial size can go into
an infinite loop when expanded. This can affect grouping using
TermAllGroupsCollector or TermAllGroupHeadsCollector if instantiated with a
non default small size. (Martijn van Groningen, yonik)
Optimizations
* LUCENE-3653: Improve concurrency in VirtualMethod and AttributeSource by


@ -52,12 +52,12 @@ public abstract class Analyzer {
* @param fieldName
* the name of the fields content passed to the
* {@link TokenStreamComponents} sink as a reader
* @param aReader
* @param reader
* the reader passed to the {@link Tokenizer} constructor
* @return the {@link TokenStreamComponents} for this analyzer.
*/
protected abstract TokenStreamComponents createComponents(String fieldName,
Reader aReader);
Reader reader);
/**
* Creates a TokenStream that is allowed to be re-use from the previous time


@ -206,7 +206,7 @@ public class SegmentTermDocs {
skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
if (!haveSkipped) { // lazily initialize skip stream
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
haveSkipped = true;
}


@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
storePayloads = false;
}
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
// DV Types are packed in one byte
byte val = input.readByte();
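The equality checks above become ordered compareTo checks because IndexOptions now forms an ordered progression ending in DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, so "indexes at least positions" can no longer be tested with ==. A minimal standalone sketch of the idea, using a hypothetical stand-in enum rather than the real FieldInfo.IndexOptions:

enum IndexOptsSketch {
  DOCS_ONLY,
  DOCS_AND_FREQS,
  DOCS_AND_FREQS_AND_POSITIONS,
  DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;

  // constants are declared in increasing order, so compareTo expresses "at least"
  boolean hasPositions() {
    return compareTo(DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }
}

class IndexOptsSketchDemo {
  public static void main(String[] args) {
    System.out.println(IndexOptsSketch.DOCS_AND_FREQS.hasPositions());                           // false
    System.out.println(IndexOptsSketch.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.hasPositions()); // true
  }
}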


@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
output.writeVInt(FORMAT_CURRENT);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;


@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// undefined
}
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
if (isFirstTerm) {
termState.proxOffset = termState.bytesReader.readVLong();
} else {
@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
if (needsOffsets) {
// TODO: once we index offsets into postings fix this!
return null;
boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (needsOffsets && !hasOffsets) {
return null; // not available
}
// TODO: refactor
if (fieldInfo.storePayloads) {
SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
if (fieldInfo.storePayloads || hasOffsets) {
SegmentFullPositionsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
} else {
docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
docsEnum = (SegmentFullPositionsEnum) reuse;
if (docsEnum.startFreqIn != freqIn) {
// If you are using ParallelReader, and pass in a
// reused DocsEnum, it could have come from another
// reader also using standard codec
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
protected boolean indexOmitsTF; // does current field omit term freq?
protected boolean storePayloads; // does current field store payloads?
protected boolean storeOffsets; // does current field store offsets?
protected int limit; // number of docs in this posting
protected int ord; // how many docs we've read
@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
storePayloads = fieldInfo.storePayloads;
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
freqOffset = termState.freqOffset;
skipOffset = termState.skipOffset;
@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset + skipOffset,
freqOffset, 0,
limit, storePayloads);
limit, storePayloads, storeOffsets);
skipped = true;
}
@ -519,7 +521,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return doc = docs[i];
}
}
return refill();
return doc = refill();
}
@Override
@ -602,7 +604,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
return doc = docs[i];
}
}
return refill();
return doc = refill();
}
@Override
@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// TODO specialize DocsAndPosEnum too
// Decodes docs & positions. payloads are not present.
// Decodes docs & positions. Neither payloads nor offsets are present.
private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, false);
limit, false, false);
skipped = true;
}
@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
}
// Decodes docs & positions & payloads
private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
// Decodes docs & positions & (payloads and/or offsets)
private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
private final IndexInput proxIn;
@ -895,16 +897,24 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
Lucene40SkipListReader skipper;
private BytesRef payload;
private long lazyProxPointer;
boolean storePayloads;
boolean storeOffsets;
int offsetLength;
int startOffset;
public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
startFreqIn = freqIn;
this.freqIn = (IndexInput) freqIn.clone();
this.proxIn = (IndexInput) proxIn.clone();
}
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
assert fieldInfo.storePayloads;
public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
assert storePayloads || storeOffsets;
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[1];
@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
doc = -1;
accum = 0;
position = 0;
startOffset = 0;
skipped = false;
posPendingCount = 0;
@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
position = 0;
startOffset = 0;
//System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return (doc = accum);
@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
//System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, true);
limit, storePayloads, storeOffsets);
skipped = true;
}
@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
lazyProxPointer = skipper.getProxPointer();
posPendingCount = 0;
position = 0;
startOffset = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
offsetLength = skipper.getOffsetLength();
}
}
@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
if (payloadPending && payloadLength > 0) {
// payload of last position as never retrieved -- skip it
// payload of last position was never retrieved -- skip it
proxIn.seek(proxIn.getFilePointer() + payloadLength);
payloadPending = false;
}
// scan over any docs that were iterated without their positions
while(posPendingCount > freq) {
final int code = proxIn.readVInt();
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
}
assert payloadLength != -1;
proxIn.seek(proxIn.getFilePointer() + payloadLength);
if (storeOffsets) {
if ((proxIn.readVInt() & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
}
if (storePayloads) {
proxIn.seek(proxIn.getFilePointer() + payloadLength);
}
posPendingCount--;
position = 0;
startOffset = 0;
payloadPending = false;
//System.out.println("StandardR.D&PE skipPos");
}
@ -1069,16 +1094,28 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
proxIn.seek(proxIn.getFilePointer()+payloadLength);
}
final int code = proxIn.readVInt();
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
int code = proxIn.readVInt();
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
payloadPending = true;
position += code >>> 1;
payloadPending = true;
code >>>= 1;
}
position += code;
if (storeOffsets) {
int offsetCode = proxIn.readVInt();
if ((offsetCode & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
startOffset += offsetCode >>> 1;
}
posPendingCount--;
@ -1090,32 +1127,36 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
@Override
public int startOffset() throws IOException {
return -1;
return storeOffsets ? startOffset : -1;
}
@Override
public int endOffset() throws IOException {
return -1;
return storeOffsets ? startOffset + offsetLength : -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override
public BytesRef getPayload() throws IOException {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
if (storePayloads) {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
return payload;
return payload;
} else {
throw new IOException("No payloads exist for this field!");
}
}
@Override


@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
IndexOptions indexOptions;
boolean storePayloads;
boolean storeOffsets;
// Starts a new term
long freqStart;
long proxStart;
FieldInfo fieldInfo;
int lastPayloadLength;
int lastOffsetLength;
int lastPosition;
int lastOffset;
// private String segment;
@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
proxStart = proxOut.getFilePointer();
// force first payload to write its length
lastPayloadLength = -1;
// force first offset to write its length
lastOffsetLength = -1;
}
skipListWriter.resetSkip();
}
@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
*/
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
if ((++df % skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
skipListWriter.bufferSkip(df);
}
@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
lastPosition = 0;
lastOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
assert proxOut != null;
// TODO: when we add offsets... often
// endOffset-startOffset will be constant or near
// constant for all docs (eg if the term wasn't stemmed
// then this will usually be the utf16 length of the
// term); would be nice to write that length once up
// front and then not encode endOffset for each
// position..
final int delta = position - lastPosition;
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
int payloadLength = 0;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
} else {
proxOut.writeVInt(delta << 1);
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
} else {
proxOut.writeVInt(delta);
}
if (storeOffsets) {
// don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
// and the numbers aren't that much smaller anyways.
int offsetDelta = startOffset - lastOffset;
int offsetLength = endOffset - startOffset;
if (offsetLength != lastOffsetLength) {
proxOut.writeVInt(offsetDelta << 1 | 1);
proxOut.writeVInt(offsetLength);
} else {
proxOut.writeVInt(offsetDelta << 1);
}
lastOffset = startOffset;
lastOffsetLength = offsetLength;
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
}
@Override
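The offset encoding above reuses the delta-plus-flag scheme already used for positions and payloads: the start-offset delta is shifted left one bit, the low bit flags whether the offset length (endOffset - startOffset) changed, and the length is written only when it differs from the previous one. A small standalone sketch of that encode/decode arithmetic, with plain Java lists standing in for the VInt stream (illustrative only, not the actual Lucene40 writer or reader):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

class OffsetDeltaSketch {
  // each input element is {startOffset, endOffset} for one position, in order
  static List<Integer> encode(int[][] offsets) {
    List<Integer> out = new ArrayList<Integer>();
    int lastOffset = 0;
    int lastLength = -1;                    // forces the first length to be written
    for (int[] o : offsets) {
      int delta = o[0] - lastOffset;
      int length = o[1] - o[0];
      if (length != lastLength) {
        out.add(delta << 1 | 1);            // low bit set: a new length follows
        out.add(length);
      } else {
        out.add(delta << 1);                // low bit clear: reuse the previous length
      }
      lastOffset = o[0];
      lastLength = length;
    }
    return out;
  }

  static int[][] decode(List<Integer> in, int count) {
    int[][] offsets = new int[count][2];
    Iterator<Integer> it = in.iterator();
    int start = 0;
    int length = 0;
    for (int i = 0; i < count; i++) {
      int code = it.next();
      if ((code & 1) != 0) {
        length = it.next();                 // length changed at this position
      }
      start += code >>> 1;
      offsets[i][0] = start;
      offsets[i][1] = start + length;
    }
    return offsets;
  }
}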
@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert firstTerm.skipOffset > 0;
bytesWriter.writeVInt(firstTerm.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(firstTerm.proxStart);
}
long lastFreqStart = firstTerm.freqStart;
@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert term.skipOffset > 0;
bytesWriter.writeVInt(term.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(term.proxStart - lastProxStart);
lastProxStart = term.proxStart;
}


@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput;
*/
public class Lucene40SkipListReader extends MultiLevelSkipListReader {
private boolean currentFieldStoresPayloads;
private boolean currentFieldStoresOffsets;
private long freqPointer[];
private long proxPointer[];
private int payloadLength[];
private int offsetLength[];
private long lastFreqPointer;
private long lastProxPointer;
private int lastPayloadLength;
private int lastOffsetLength;
public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@ -44,17 +47,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
freqPointer = new long[maxSkipLevels];
proxPointer = new long[maxSkipLevels];
payloadLength = new int[maxSkipLevels];
offsetLength = new int[maxSkipLevels];
}
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
super.init(skipPointer, df);
this.currentFieldStoresPayloads = storesPayloads;
this.currentFieldStoresOffsets = storesOffsets;
lastFreqPointer = freqBasePointer;
lastProxPointer = proxBasePointer;
Arrays.fill(freqPointer, freqBasePointer);
Arrays.fill(proxPointer, proxBasePointer);
Arrays.fill(payloadLength, 0);
Arrays.fill(offsetLength, 0);
}
/** Returns the freq pointer of the doc to which the last call of
@ -76,12 +82,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
return lastPayloadLength;
}
/** Returns the offset length (endOffset-startOffset) of the position stored just before
* the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
* has skipped. */
public int getOffsetLength() {
return lastOffsetLength;
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
freqPointer[level] = lastFreqPointer;
proxPointer[level] = lastProxPointer;
payloadLength[level] = lastPayloadLength;
offsetLength[level] = lastOffsetLength;
}
@Override
@ -90,6 +104,7 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
lastFreqPointer = freqPointer[level];
lastProxPointer = proxPointer[level];
lastPayloadLength = payloadLength[level];
lastOffsetLength = offsetLength[level];
}
@ -110,6 +125,11 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
} else {
delta = skipStream.readVInt();
}
if (currentFieldStoresOffsets) {
offsetLength[level] = skipStream.readVInt();
}
freqPointer[level] += skipStream.readVInt();
proxPointer[level] += skipStream.readVInt();


@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
private int curDoc;
private boolean curStorePayloads;
private boolean curStoreOffsets;
private int curPayloadLength;
private int curOffsetLength;
private long curFreqPointer;
private long curProxPointer;
@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
/**
* Sets the values for the current skip data.
*/
public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
this.curDoc = doc;
this.curStorePayloads = storePayloads;
this.curPayloadLength = payloadLength;
this.curStoreOffsets = storeOffsets;
this.curOffsetLength = offsetLength;
this.curFreqPointer = freqOutput.getFilePointer();
if (proxOutput != null)
this.curProxPointer = proxOutput.getFilePointer();
@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
// current field does not store payloads
skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
}
// TODO: not sure it really helps to shove this somewhere else if its the same as the last skip
if (curStoreOffsets) {
skipBuffer.writeVInt(curOffsetLength);
}
skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));


@ -548,8 +548,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
visitedDocs.set(docID);
} else if (StringHelper.startsWith(scratch, POS)) {
totalTermFreq++;
} else if (StringHelper.startsWith(scratch, FREQ)) {
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+FREQ.length, scratch.length-FREQ.length, scratchUTF16);
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
} else if (StringHelper.startsWith(scratch, TERM)) {
if (lastDocsStart != -1) {
b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>(lastDocsStart,


@ -404,7 +404,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
public boolean hasNorms() {
for (FieldInfo fi : this) {
if (fi.isIndexed && !fi.omitNorms) {
if (fi.normsPresent()) {
return true;
}
}


@ -921,13 +921,7 @@ public abstract class IndexReader implements Closeable {
* If this method returns an empty array, that means this
* reader is a null reader (for example a MultiReader
* that has no sub readers).
* <p>
* NOTE: You should not try using sub-readers returned by
* this method to make any changes (deleteDocument,
* etc.). While this might succeed for one composite reader
* (like MultiReader), it will most likely lead to index
* corruption for other readers (like DirectoryReader obtained
* through {@link #open}. Use the parent reader directly. */
*/
public IndexReader[] getSequentialSubReaders() {
ensureOpen();
return null;


@ -32,6 +32,10 @@ public class CollectionStatistics {
private final long sumDocFreq;
public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) {
assert maxDoc >= 0;
assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
assert sumDocFreq >= -1;
assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq; // #positions must be >= #postings
this.field = field;
this.maxDoc = maxDoc;
this.docCount = docCount;


@ -29,6 +29,8 @@ public class TermStatistics {
private final long totalTermFreq;
public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) {
assert docFreq >= 0;
assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
this.term = term;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
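Both constructors now assert the basic sanity invariants of these statistics: counts are non-negative (or -1 when unavailable), and a term's total occurrence count is at least its document frequency, just as a field's total position count is at least its total posting count. A tiny worked example under those definitions (illustrative values only):

class TermStatsInvariantDemo {
  public static void main(String[] args) {
    int docFreq = 3;          // documents containing the term (#postings)
    long totalTermFreq = 7;   // occurrences of the term across those documents (#positions)
    // mirrors the assertion above: either unknown (-1) or >= docFreq
    System.out.println(totalTermFreq == -1 || totalTermFreq >= docFreq);  // prints true
  }
}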


@ -87,6 +87,8 @@ public abstract class SimilarityBase extends Similarity {
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
* Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
// #positions(field) must be >= #positions(term)
assert collectionStats.sumTotalTermFreq() == -1 || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
int numberOfDocuments = collectionStats.maxDoc();
int docFreq = termStats.docFreq();


@ -70,8 +70,9 @@ import org.apache.lucene.util.ToStringUtils;
* and 'jones' in position 1). </p>
*
* <p>Note: as {@link #getField()} returns the masked field, scoring will be
* done using the norms of the field name supplied. This may lead to unexpected
* scoring behaviour.</p>
* done using the Similarity and collection statistics of the field name supplied,
* but with the term statistics of the real field. This may lead to exceptions,
* poor performance, and unexpected scoring behaviour.</p>
*/
public class FieldMaskingSpanQuery extends SpanQuery {
private SpanQuery maskedQuery;


@ -255,10 +255,8 @@ final class CompoundFileWriter implements Closeable{
assert !seenIDs.contains(id): "file=\"" + name + "\" maps to id=\"" + id + "\", which was already written";
seenIDs.add(id);
final DirectCFSIndexOutput out;
if (outputTaken.compareAndSet(false, true)) {
if ((outputLocked = outputTaken.compareAndSet(false, true))) {
out = new DirectCFSIndexOutput(getOutput(), entry, false);
outputLocked = true;
success = true;
} else {
entry.dir = this.directory;
if (directory.fileExists(name)) {


@ -120,42 +120,6 @@ public final class ReaderUtil {
protected abstract void add(int base, IndexReader r) throws IOException;
}
/**
* Returns sub IndexReader that contains the given document id.
*
* @param doc id of document
* @param reader parent reader
* @return sub reader of parent which contains the specified doc id
*/
public static IndexReader subReader(int doc, IndexReader reader) {
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReadersList, reader);
IndexReader[] subReaders = subReadersList
.toArray(new IndexReader[subReadersList.size()]);
int[] docStarts = new int[subReaders.length];
int maxDoc = 0;
for (int i = 0; i < subReaders.length; i++) {
docStarts[i] = maxDoc;
maxDoc += subReaders[i].maxDoc();
}
return subReaders[subIndex(doc, docStarts)];
}
/**
* Returns sub-reader subIndex from reader.
*
* @param reader parent reader
* @param subIndex index of desired sub reader
* @return the subreader at subIndex
*/
public static IndexReader subReader(IndexReader reader, int subIndex) {
List<IndexReader> subReadersList = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReadersList, reader);
IndexReader[] subReaders = subReadersList
.toArray(new IndexReader[subReadersList.size()]);
return subReaders[subIndex];
}
public static ReaderContext buildReaderContext(IndexReader reader) {
return new ReaderContextBuilder(reader).build();


@ -96,13 +96,13 @@ public class SentinelIntSet {
public int put(int key) {
int s = find(key);
if (s < 0) {
count++;
if (count >= rehashCount) {
rehash();
s = getSlot(key);
} else {
s = -s-1;
}
count++;
keys[s] = key;
}
return s;


@ -32,12 +32,13 @@ import org.apache.lucene.util.BytesRef;
public final class ByteSequenceOutputs extends Outputs<BytesRef> {
private final static BytesRef NO_OUTPUT = new BytesRef();
private final static ByteSequenceOutputs singleton = new ByteSequenceOutputs();
private ByteSequenceOutputs() {
}
public static ByteSequenceOutputs getSingleton() {
return new ByteSequenceOutputs();
return singleton;
}
@Override


@ -32,12 +32,13 @@ import org.apache.lucene.util.IntsRef;
public final class IntSequenceOutputs extends Outputs<IntsRef> {
private final static IntsRef NO_OUTPUT = new IntsRef();
private final static IntSequenceOutputs singleton = new IntSequenceOutputs();
private IntSequenceOutputs() {
}
public static IntSequenceOutputs getSingleton() {
return new IntSequenceOutputs();
return singleton;
}
@Override


@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.ArrayList;
@ -289,8 +290,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
};
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength) throws IOException {
checkRandomData(random, a, iterations, maxWordLength, random.nextBoolean());
}
public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter) throws IOException {
for (int i = 0; i < iterations; i++) {
String text;
switch(_TestUtil.nextInt(random, 0, 4)) {
@ -311,7 +316,9 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
TokenStream ts = a.tokenStream("dummy", new StringReader(text));
int remainder = random.nextInt(10);
Reader reader = new StringReader(text);
TokenStream ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
@ -339,30 +346,38 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
}
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions));
toIntArray(positions),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
toIntArray(positions));
null,
toIntArray(positions),
text.length());
} else if (offsetAtt != null) {
// offset
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets));
toIntArray(endOffsets),
null,
null,
text.length());
} else {
// terms only
assertAnalyzesToReuse(a, text,
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}


@ -0,0 +1,100 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.SortedMap;
import java.util.TreeMap;
// the purpose of this charfilter is to send offsets out of bounds
// if the analyzer doesn't use correctOffset or does incorrect offset math.
class MockCharFilter extends CharStream {
final Reader in;
final int remainder;
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = in;
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@Override
public int read() throws IOException {
// we have a buffered character, add an offset correction and return it
if (bufferedCh >= 0) {
int ch = bufferedCh;
bufferedCh = -1;
currentOffset++;
addOffCorrectMap(currentOffset+delta, delta-1);
delta--;
return ch;
}
// otherwise actually read one
int ch = in.read();
if (ch < 0)
return ch;
currentOffset++;
if ((ch % 10) != remainder || Character.isHighSurrogate((char)ch) || Character.isLowSurrogate((char)ch)) {
return ch;
}
// we will double this character, so buffer it.
bufferedCh = ch;
return ch;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for (int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
@Override
public int correctOffset(int currentOff) {
SortedMap<Integer,Integer> subMap = corrections.subMap(0, currentOff+1);
int ret = subMap.isEmpty() ? currentOff : currentOff + subMap.get(subMap.lastKey());
assert ret >= 0 : "currentOff=" + currentOff + ",diff=" + (ret-currentOff);
return ret;
}
protected void addOffCorrectMap(int off, int cumulativeDiff) {
corrections.put(off, cumulativeDiff);
}
TreeMap<Integer,Integer> corrections = new TreeMap<Integer,Integer>();
}


@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
}
if ((++df % termsOut.skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
skipListWriter.bufferSkip(df);
}


@ -268,8 +268,19 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
assert subStats != null;
}
docFreq += subStats.docFreq();
totalTermFreq += subStats.totalTermFreq();
int nodeDocFreq = subStats.docFreq();
if (docFreq >= 0 && nodeDocFreq >= 0) {
docFreq += nodeDocFreq;
} else {
docFreq = -1;
}
long nodeTotalTermFreq = subStats.totalTermFreq();
if (totalTermFreq >= 0 && nodeTotalTermFreq >= 0) {
totalTermFreq += nodeTotalTermFreq;
} else {
totalTermFreq = -1;
}
}
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
@ -299,9 +310,29 @@ public abstract class ShardSearchingTestBase extends LuceneTestCase {
// Collection stats are pre-shared on reopen, so,
// we better not have a cache miss:
assert nodeStats != null: "myNodeID=" + myNodeID + " nodeID=" + nodeID + " version=" + nodeVersions[nodeID] + " field=" + field;
docCount += nodeStats.docCount();
sumTotalTermFreq += nodeStats.sumTotalTermFreq();
sumDocFreq += nodeStats.sumDocFreq();
int nodeDocCount = nodeStats.docCount();
if (docCount >= 0 && nodeDocCount >= 0) {
docCount += nodeDocCount;
} else {
docCount = -1;
}
long nodeSumTotalTermFreq = nodeStats.sumTotalTermFreq();
if (sumTotalTermFreq >= 0 && nodeSumTotalTermFreq >= 0) {
sumTotalTermFreq += nodeSumTotalTermFreq;
} else {
sumTotalTermFreq = -1;
}
long nodeSumDocFreq = nodeStats.sumDocFreq();
if (sumDocFreq >= 0 && nodeSumDocFreq >= 0) {
sumDocFreq += nodeSumDocFreq;
} else {
sumDocFreq = -1;
}
assert nodeStats.maxDoc() >= 0;
maxDoc += nodeStats.maxDoc();
}
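The aggregation above follows a single rule: -1 means a statistic is unavailable on some node, and once any node reports -1 the combined value stays -1; otherwise the per-node values are summed. A minimal sketch of that rule as a hypothetical helper (not part of ShardSearchingTestBase):

final class StatMergeSketch {
  // -1 means "statistic not available"; one unknown node makes the total unknown
  static long add(long total, long nodeValue) {
    return (total >= 0 && nodeValue >= 0) ? total + nodeValue : -1;
  }

  public static void main(String[] args) {
    System.out.println(add(add(0, 10), 5));    // 15
    System.out.println(add(add(0, 10), -1));   // -1
    System.out.println(add(add(0, -1), 5));    // -1: stays unknown once any node is unknown
  }
}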


@ -283,7 +283,8 @@ public abstract class LuceneTestCase extends Assert {
int randomVal = random.nextInt(10);
if ("Lucene3x".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal < 2)) { // preflex-only setup
codec = new PreFlexRWCodec();
codec = Codec.forName("Lucene3x");
assert (codec instanceof PreFlexRWCodec) : "fix your classpath to have tests-framework.jar before lucene-core.jar";
PREFLEX_IMPERSONATION_IS_ACTIVE = true;
} else if ("SimpleText".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 9)) {
codec = new SimpleTextCodec();


@ -249,7 +249,42 @@ public class _TestUtil {
}
}
// TODO: make this more evil
private static final String[] HTML_CHAR_ENTITIES = {
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
"Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
"Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
"Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
"Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
"QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
"Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
"Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
"alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
"auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
"cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
"curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
"eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
"equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
"frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
"harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
"image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
"lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
"lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
"mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
"ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
"oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
"ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
"perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
"psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
"rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
"sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
"spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
"szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
"tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
"uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
"yuml", "zeta", "zwj", "zwnj"
};
public static String randomHtmlishString(Random random, int numElements) {
final int end = random.nextInt(numElements);
if (end == 0) {
@ -258,17 +293,80 @@ public class _TestUtil {
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < end; i++) {
int val = random.nextInt(10);
int val = random.nextInt(25);
switch(val) {
case 0: sb.append("<p>"); break;
case 1: sb.append("</p>"); break;
case 2: sb.append("<!--"); break;
case 3: sb.append("-->"); break;
case 4: sb.append("&#"); break;
case 5: sb.append(";"); break;
case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
default:
sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
case 1: {
sb.append("<");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
sb.append(' ');
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append('=');
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append("\"".substring(nextInt(random, 0, 1)));
sb.append(randomSimpleString(random));
sb.append("\"".substring(nextInt(random, 0, 1)));
}
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append("/".substring(nextInt(random, 0, 1)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 2: {
sb.append("</");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 3: sb.append(">"); break;
case 4: sb.append("</p>"); break;
case 5: sb.append("<!--"); break;
case 6: sb.append("<!--#"); break;
case 7: sb.append("<script><!-- f('"); break;
case 8: sb.append("</script>"); break;
case 9: sb.append("<?"); break;
case 10: sb.append("?>"); break;
case 11: sb.append("\""); break;
case 12: sb.append("\\\""); break;
case 13: sb.append("'"); break;
case 14: sb.append("\\'"); break;
case 15: sb.append("-->"); break;
case 16: {
sb.append("&");
switch(nextInt(random, 0, 2)) {
case 0: sb.append(randomSimpleString(random)); break;
case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
}
sb.append(";".substring(nextInt(random, 0, 1)));
break;
}
case 17: {
sb.append("&#");
if (0 == nextInt(random, 0, 1)) {
sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 18: {
sb.append("&#x");
if (0 == nextInt(random, 0, 1)) {
sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 19: sb.append(";"); break;
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
case 21: sb.append("\n");
case 22: sb.append(" ".substring(nextInt(random, 0, 10)));
default: sb.append(randomSimpleString(random));
}
}
return sb.toString();


@ -0,0 +1,58 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestMockCharFilter extends BaseTokenStreamTestCase {
public void test() throws IOException {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MockCharFilter(CharReader.get(reader), 7);
}
};
assertAnalyzesTo(analyzer, "ab",
new String[] { "aab" },
new int[] { 0 },
new int[] { 2 }
);
assertAnalyzesTo(analyzer, "aba",
new String[] { "aabaa" },
new int[] { 0 },
new int[] { 3 }
);
assertAnalyzesTo(analyzer, "abcdefga",
new String[] { "aabcdefgaa" },
new int[] { 0 },
new int[] { 8 }
);
}
}


@ -22,29 +22,46 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.apache.lucene.util._TestUtil;
public class TestPostingsOffsets extends LuceneTestCase {
IndexWriterConfig iwc;
public void setUp() throws Exception {
super.setUp();
// Currently only SimpleText and Lucene40 can index offsets into postings:
assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
}
public void testBasic() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@ -94,16 +111,117 @@ public class TestPostingsOffsets extends LuceneTestCase {
r.close();
dir.close();
}
public void testSkipping() throws Exception {
doTestNumbers(false);
}
public void testPayloads() throws Exception {
doTestNumbers(true);
}
public void doTestNumbers(boolean withPayloads) throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random.nextBoolean()) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(random.nextBoolean());
ft.setStoreTermVectorPositions(random.nextBoolean());
}
int numDocs = atLeast(500);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(new Field("numbers", English.intToEnglish(i), ft));
doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
doc.add(new StringField("id", "" + i));
w.addDocument(doc);
}
IndexReader reader = w.getReader();
w.close();
String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
for (String term : terms) {
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
int doc;
while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
String storedNumbers = reader.document(doc).get("numbers");
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals(term));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
}
// check we can skip correctly
int numSkippingTests = atLeast(50);
for (int j = 0; j < numSkippingTests; j++) {
int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
int doc = dp.advance(num);
assertEquals(num, doc);
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
String storedNumbers = reader.document(doc).get("numbers");
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals("hundred"));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
// check that other fields (without offsets) work correctly
for (int i = 0; i < numDocs; i++) {
DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
assertEquals(i, dp.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
}
reader.close();
dir.close();
}
public void testRandom() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
// token -> docID -> tokens
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
final int numDocs = atLeast(20);
//final int numDocs = atLeast(5);


@ -31,6 +31,7 @@ import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
@@ -240,6 +241,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
}
public void testSimple2() throws Exception {
assumeTrue("Broken scoring: LUCENE-3723",
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "smith"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
@@ -310,6 +313,8 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
}
public void testSpans2() throws Exception {
assumeTrue("Broken scoring: LUCENE-3723",
searcher.getSimilarityProvider().get("id") instanceof TFIDFSimilarity);
SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery qA2 = new SpanTermQuery(new Term("first", "james"));
SpanQuery qA = new SpanOrQuery(qA1, new FieldMaskingSpanQuery(qA2, "gender"));

View File

@@ -20,6 +20,8 @@ package org.apache.lucene.util;
import org.junit.Test;
import java.util.HashSet;
/**
*
*
@@ -45,4 +47,32 @@ public class TestSentinelIntSet extends LuceneTestCase {
assertEquals(20, set.size());
assertEquals(24, set.rehashCount);
}
@Test
public void testRandom() throws Exception {
for (int i=0; i<10000; i++) {
int initSz = random.nextInt(20);
int num = random.nextInt(30);
int maxVal = (random.nextBoolean() ? random.nextInt(50) : random.nextInt(Integer.MAX_VALUE)) + 1;
HashSet<Integer> a = new HashSet<Integer>(initSz);
SentinelIntSet b = new SentinelIntSet(initSz, -1);
for (int j=0; j<num; j++) {
int val = random.nextInt(maxVal);
boolean exists = !a.add(val);
boolean existsB = b.exists(val);
assertEquals(exists, existsB);
int slot = b.find(val);
assertEquals(exists, slot>=0);
b.put(val);
assertEquals(a.size(), b.size());
}
}
}
}

View File

@@ -1055,6 +1055,50 @@ public class TestFSTs extends LuceneTestCase {
}
}
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test3() throws Exception {
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
final FST<Long> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getArcCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// NOTE: this test shows a case where our current builder
// fails to produce minimal FST:
/*
public void test4() throws Exception {
final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
IntsRef scratchIntsRef = new IntsRef();
builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
final FST<BytesRef> fst = builder.finish();
//System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
// NOTE: we produce 7 nodes today
assertEquals(6, fst.getNodeCount());
// NOTE: we produce 8 arcs today
assertEquals(7, fst.getArcCount());
//Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
//Util.toDot(fst, w, false, false);
//w.close();
}
*/
// Build FST for all unique terms in the test line docs
// file, up until a time limit
public void testRealTerms() throws Exception {

View File

@@ -31,14 +31,38 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
<target name="gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
<fileset dir="../icu" includes="build.xml"/>
</subant>
</target>
<target name="jflex-HTMLStripCharFilter"
depends="init,jflex-check,generate-jflex-html-char-entities"
if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on"/>
<!-- Remove the inappropriate JFlex-generated constructors -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
</target>
<target name="generate-jflex-html-char-entities">
<exec dir="src/java/org/apache/lucene/analysis/charfilter"
output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
executable="${python.exe}" failonerror="true" logerror="true">
<arg value="htmlentity.py"/>
</exec>
</target>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>

View File

@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
import java.util.Arrays;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
0 : diffs[size-1];
}
/**
* <p>
* Adds an offset correction mapping at the given output stream offset.
* </p>
* <p>
* Assumption: the offset given with each successive call to this method
* will not be smaller than the offset given at the previous invocation.
* </p>
*
* @param off The output stream offset at which to apply the correction
* @param cumulativeDiff The input offset is given by adding this
* to the output offset
*/
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
diffs = ArrayUtil.grow(diffs);
}
offsets[size] = off;
diffs[size++] = cumulativeDiff;
assert (size == 0 || off >= offsets[size - 1])
: "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
if (size == 0 || off != offsets[size - 1]) {
offsets[size] = off;
diffs[size++] = cumulativeDiff;
} else { // Overwrite the diff at the last recorded offset
diffs[size - 1] = cumulativeDiff;
}
}
}
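The assertion and de-duplication added above are part of a cumulative offset-correction scheme: each recorded entry says "by this output offset, this many input characters have been dropped so far". The sketch below shows how such a table is typically consumed by a correctOffset-style lookup; the class and method names are hypothetical and this is not the actual Lucene implementation, only an illustration of the bookkeeping described in the javadoc.

import java.util.Arrays;

/** Minimal sketch of cumulative offset correction, assuming the scheme described above. */
class OffsetCorrectionSketch {
  private int[] offsets = new int[4]; // output offsets where the cumulative diff changes
  private int[] diffs = new int[4];   // cumulative (input - output) difference at that offset
  private int size = 0;

  void addOffCorrectMap(int off, int cumulativeDiff) {
    if (size == offsets.length) {
      offsets = Arrays.copyOf(offsets, size * 2);
      diffs = Arrays.copyOf(diffs, size * 2);
    }
    if (size == 0 || off != offsets[size - 1]) {
      offsets[size] = off;
      diffs[size++] = cumulativeDiff;
    } else {
      diffs[size - 1] = cumulativeDiff; // overwrite the diff at the last recorded offset
    }
  }

  /** Maps an output-stream offset back to the corresponding input-stream offset. */
  int correctOffset(int outputOffset) {
    int i = Arrays.binarySearch(offsets, 0, size, outputOffset);
    if (i < 0) {
      i = -i - 2; // index of the largest recorded offset <= outputOffset
    }
    return i < 0 ? outputOffset : outputOffset + diffs[i];
  }

  public static void main(String[] args) {
    // Stripping "<b>hi</b>" down to "hi": 3 input chars ("<b>") vanish before
    // output offset 0, and 7 in total (adding "</b>") before output offset 2.
    OffsetCorrectionSketch c = new OffsetCorrectionSketch();
    c.addOffCorrectMap(0, 3);
    c.addOffCorrectMap(2, 7);
    System.out.println(c.correctOffset(0)); // 3 -> 'h' came from input position 3
    System.out.println(c.correctOffset(1)); // 4 -> 'i' came from input position 4
    System.out.println(c.correctOffset(2)); // 9 -> the end offset maps past "</b>"
  }
}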

View File

@@ -0,0 +1,153 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Set<String> upperCaseVariantsAccepted
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
};
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
if (upperCaseVariantsAccepted.contains(entities[i])) {
entityValues.put(entities[i].toUpperCase(), value);
}
}
}
%}

View File

@@ -0,0 +1,58 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD86E][\uDC00-\uDC1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
| [\uD82C][\uDC00\uDC01]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)

View File

@@ -0,0 +1,874 @@
package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.0
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5] Name ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
// <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
// <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespaces.
//
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
[bB][lL][uU][rR] |
[cC][hH][aA][nN][gG][eE] |
[cC][lL][iI][cC][kK] |
[dD][bB][lL][cC][lL][iI][cC][kK] |
[eE][rR][rR][oO][rR] |
[fF][oO][cC][uU][sS] |
[kK][eE][yY][dD][oO][wW][nN] |
[kK][eE][yY][pP][rR][eE][sS][sS] |
[kK][eE][yY][uU][pP] |
[lL][oO][aA][dD] |
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
[mM][oO][uU][sS][eE][oO][uU][tT] |
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
[mM][oO][uU][sS][eE][uU][pP] |
[rR][eE][sS][eE][tT] |
[sS][eE][lL][eE][cC][tT] |
[sS][uU][bB][mM][iI][tT] |
[uU][nN][lL][oO][aA][dD] )
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] |
[aA][cC][rR][oO][nN][yY][mM] |
[bB][aA][sS][eE][fF][oO][nN][tT] |
[bB][dD][oO] |
[bB][iI][gG] |
[cC][iI][tT][eE] |
[cC][oO][dD][eE] |
[dD][fF][nN] |
[eE][mM] |
[fF][oO][nN][tT] |
[iI][mM][gG] |
[iI][nN][pP][uU][tT] |
[kK][bB][dD] |
[lL][aA][bB][eE][lL] |
[sS][aA][mM][pP] |
[sS][eE][lL][eE][cC][tT] |
[sS][mM][aA][lL][lL] |
[sS][pP][aA][nN] |
[sS][tT][rR][iI][kK][eE] |
[sS][tT][rR][oO][nN][gG] |
[sS][uU][bB] |
[sS][uU][pP] |
[tT][eE][xX][tT][aA][rR][eE][aA] |
[tT][tT] |
[vV][aA][rR] )
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
private static final char BR_START_TAG_REPLACEMENT = '\n';
private static final char BR_END_TAG_REPLACEMENT = '\n';
private static final char SCRIPT_REPLACEMENT = '\n';
private static final char STYLE_REPLACEMENT = '\n';
private static final char REPLACEMENT_CHARACTER = '\uFFFD';
private CharArraySet escapedTags = null;
private int inputStart;
private int cumulativeDiff;
private boolean escapeBR = false;
private boolean escapeSCRIPT = false;
private boolean escapeSTYLE = false;
private int restoreState;
private int previousRestoreState;
private int outputCharCount;
private int eofReturnValue;
private TextSegment inputSegment
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
* @param source the {@link CharStream} to strip HTML markup from
*/
public HTMLStripCharFilter(CharStream source) {
super(source);
this.zzReader = source;
}
/**
* @param source the {@link CharStream} to strip HTML markup from
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {
escapeBR = true;
} else if (tag.equalsIgnoreCase("SCRIPT")) {
escapeSCRIPT = true;
} else if (tag.equalsIgnoreCase("STYLE")) {
escapeSTYLE = true;
} else {
if (null == this.escapedTags) {
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
}
this.escapedTags.add(tag);
}
}
}
}
@Override
public int read() throws IOException {
if (outputSegment.isRead()) {
if (zzAtEOF) {
return -1;
}
int ch = nextChar();
++outputCharCount;
return ch;
}
int ch = outputSegment.nextChar();
++outputCharCount;
return ch;
}
@Override
public int read(char cbuf[], int off, int len) throws IOException {
int i = 0;
for ( ; i < len ; ++i) {
int ch = read();
if (ch == -1) break;
cbuf[off++] = (char)ch;
}
return i > 0 ? i : (len == 0 ? 0 : -1);
}
@Override
public void close() throws IOException {
yyclose();
}
static int getInitialBufferSize() { // Package private, for testing purposes
return ZZ_BUFFERSIZE;
}
private class TextSegment extends OpenStringBuilder {
/** The position from which the next char will be read. */
int pos = 0;
/** Wraps the given buffer and sets this.len to the given length. */
TextSegment(char[] buffer, int length) {
super(buffer, length);
}
/** Allocates an internal buffer of the given size. */
TextSegment(int size) {
super(size);
}
/** Sets len = 0 and pos = 0. */
void clear() {
reset();
restart();
}
/** Sets pos = 0 */
void restart() {
pos = 0;
}
/** Returns the next char in the segment. */
int nextChar() {
assert (! isRead()): "Attempting to read past the end of a segment.";
return buf[pos++];
}
/** Returns true when all characters in the text segment have been read */
boolean isRead() {
return pos >= len;
}
}
%}
%eofval{
return eofReturnValue;
%eofval}
%eof{
switch (zzLexicalState) {
case SCRIPT:
case COMMENT:
case SCRIPT_COMMENT:
case STYLE:
case STYLE_COMMENT:
case SINGLE_QUOTED_STRING:
case DOUBLE_QUOTED_STRING:
case END_TAG_TAIL_EXCLUDE:
case END_TAG_TAIL_SUBSTITUTE:
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
cumulativeDiff += yychar - inputStart;
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
break;
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
cumulativeDiff += inputSegment.length() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
}
case BANG:
case CDATA:
case AMPERSAND:
case NUMERIC_CHARACTER:
case END_TAG_TAIL_INCLUDE:
case START_TAG_TAIL_INCLUDE:
case LEFT_ANGLE_BRACKET:
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
break;
}
default: {
eofReturnValue = -1;
}
}
%eof}
%%
"&" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
"<" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
<AMPERSAND> {
{CharacterEntities} {
int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
"#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
// 1 1 11 11
// 0 1 2 3 45 678 9 0 1 23 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
// 1 1 11 11
// 01 2 345 678 9 0 1 23 45
"#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 0 1 2 3 45 6789 0 123 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 01 2 345 6789 0 123 45
"#5" [56] \d{3} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
char lowSurrogate = '\u0000';
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
cumulativeDiff += inputSegment.length() + yylength() - 2;
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
}
<NUMERIC_CHARACTER> {
[xX] [0-9A-Fa-f]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 6) { // 10FFFF: max 6 hex chars
String hexCharRef
= new String(zzBuffer, zzStartRead + 1, matchLength - 1);
int codePoint = 0;
try {
codePoint = Integer.parseInt(hexCharRef, 16);
} catch(Exception e) {
assert false: "Exception parsing hex code point '" + hexCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
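// An unpaired surrogate code point is not a valid character by itself,
// so emit U+FFFD rather than writing a lone surrogate to the output.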
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[0-9]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
int codePoint = 0;
try {
codePoint = Integer.parseInt(decimalCharRef);
} catch(Exception e) {
assert false: "Exception parsing code point '" + decimalCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
}
<CHARACTER_REFERENCE_TAIL> {
";" {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<LEFT_ANGLE_BRACKET_SLASH> {
\s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
[bB][rR] \s* ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
}
<END_TAG_TAIL_INCLUDE> {
\s* ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<END_TAG_TAIL_EXCLUDE> {
\s* ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
}
<END_TAG_TAIL_SUBSTITUTE> {
\s* ">" {
cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
}
<LEFT_ANGLE_BRACKET> {
"!" { inputSegment.append('!'); yybegin(BANG); }
"/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
\s+ {
inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
"?" [^>]* [/?] ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
\s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
cumulativeDiff
+= inputSegment.length() + yylength() - outputSegment.length();
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
\s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
\s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
}
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
}
<START_TAG_TAIL_INCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<START_TAG_TAIL_EXCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
}
<START_TAG_TAIL_SUBSTITUTE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
cumulativeDiff += inputSegment.length() + yylength() - 1;
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
}
<BANG> {
"--" { yybegin(COMMENT); }
">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [18] CDSect ::= CDStart CData CDEnd
// [19] CDStart ::= '<![CDATA['
// [20] CData ::= (Char* - (Char* ']]>' Char*))
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
}
}
<CDATA> {
"]]>" {
cumulativeDiff += yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
[^] { return zzBuffer[zzStartRead]; }
}
<COMMENT> {
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"-->" {
cumulativeDiff += yychar - inputStart + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
[^] { }
}
<SERVER_SIDE_INCLUDE> {
"-->" { yybegin(restoreState); }
"'" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
"\"" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
[^] { }
}
<SCRIPT_COMMENT> {
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(SCRIPT); }
[^] { }
}
<STYLE_COMMENT> {
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(STYLE); }
[^] { }
}
<SINGLE_QUOTED_STRING> {
"\\" [^] { }
"'" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<DOUBLE_QUOTED_STRING> {
"\\" [^] { }
"\"" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<SCRIPT> {
"<!--" { yybegin(SCRIPT_COMMENT); }
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
[^] { }
}
<STYLE> {
"<!--" { yybegin(STYLE_COMMENT); }
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
cumulativeDiff += yychar - inputStart;
int outputEnd = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
cumulativeDiff += yylength() - 1;
++outputEnd;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(outputEnd, cumulativeDiff);
return returnValue;
}
[^] { }
}
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
[^] {
yypushback(1);
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[^] { return zzBuffer[zzStartRead]; }
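For orientation, here is a minimal usage sketch of the filter produced by the grammar above, assuming the Lucene 4.0-era CharStream/CharReader API; the wrapper class and the sample output shown are illustrative only and not part of this patch.

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class HTMLStripExample {
  public static void main(String[] args) throws Exception {
    Reader html = new StringReader("<p>hello <b>world</b> &amp; more</p>");
    HTMLStripCharFilter filter = new HTMLStripCharFilter(CharReader.get(html));
    StringBuilder sb = new StringBuilder();
    int ch;
    while ((ch = filter.read()) != -1) {
      sb.append((char) ch);
    }
    filter.close();
    System.out.println(sb); // e.g. "\nhello world & more\n"
  }
}

Block-level tags are replaced with newlines, inline tags are dropped, and character entities are decoded, while the addOffCorrectMap calls keep correctOffset pointing back into the original markup.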

View File

@@ -0,0 +1,530 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
print get_apache_license()
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line, ')'
print '%{'
print ' private static final Set<String> upperCaseVariantsAccepted'
print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' static {'
print ' String[] entities = {'
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line[:-1]
print ' };'
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
print ' Character value = entities[i + 1].charAt(0);'
print ' entityValues.put(entities[i], value);'
print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
print ' entityValues.put(entities[i].toUpperCase(), value);'
print ' }'
print ' }'
print " }"
print "%}"
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
def get_apache_license():
license = r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
main()

View File

@ -17,6 +17,42 @@
-->
<html><head></head>
<body>
Filters that normalize text before tokenization.
<p>
Chainable filters that normalize text before tokenization and provide
mappings between normalized text offsets and the corresponding offset
in the original text.
</p>
<H2>CharFilter offset mappings</H2>
<p>
CharFilters modify an input stream via a series of substring
replacements (including deletions and insertions) to produce an output
stream. There are three possible replacement cases: the replacement
string has the same length as the original substring; the replacement
is shorter; and the replacement is longer. In the latter two cases
(when the replacement has a different length than the original),
one or more offset correction mappings are required.
</p>
<p>
When the replacement is shorter than the original (e.g. when the
replacement is the empty string), a single offset correction mapping
should be added at the replacement's end offset in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMapping()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string (a positive value).
</p>
<p>
When the replacement is longer than the original (e.g. when the
original is the empty string), you should add as many offset
correction mappings as the difference between the lengths of the
replacement string and the original substring, starting at the
end offset the original substring would have had in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMapping()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string so far (a negative value).
</p>
</body>
</html>
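To make the two mapping rules above concrete, here is a minimal sketch (not part of this change) of a CharFilter that deletes every 'x' from its input and registers the corresponding offset corrections. It assumes the BaseCharFilter/CharStream API described above (addOffCorrectMapping(off, cumulativeDiff) plus the protected input field inherited from CharFilter); the class name and the exact import locations are illustrative, since the charfilter classes live in different packages in core vs. the analysis module.
import java.io.IOException;
// assumed locations; adjust to where BaseCharFilter/CharStream live in this tree
import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharStream;
public final class StripXCharFilter extends BaseCharFilter {
  private int cumulativeDiff = 0; // total number of chars deleted so far
  private int outputPos = 0;      // number of chars emitted to the consumer so far
  public StripXCharFilter(CharStream in) {
    super(in);
  }
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (len == 0) return 0;
    int written = 0;
    while (written < len) {
      int c = input.read();
      if (c == -1) break;
      if (c == 'x') {
        // the replacement ("") is shorter than the original ("x"): add a single
        // mapping at the replacement's end offset in the output stream
        cumulativeDiff++;
        addOffCorrectMapping(outputPos, cumulativeDiff);
      } else {
        cbuf[off + written++] = (char) c;
        outputPos++;
      }
    }
    return written == 0 ? -1 : written;
  }
}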

View File

@ -154,13 +154,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
public CompoundToken(int offset, int length) {
final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
// TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
// chars from the term, offsets may not match correctly (other filters producing tokens
// may also have this problem):
this.startOffset = newStart;
this.endOffset = newStart + length;
// offsets of the original word
int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) {
// if the length implied by the start and end offsets doesn't match the term text
// length, assume this token is a synonym and don't adjust the offsets.
this.startOffset = startOff;
this.endOffset = endOff;
} else {
final int newStart = startOff + offset;
this.startOffset = newStart;
this.endOffset = newStart + length;
}
}
}
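For a concrete (hypothetical) illustration of the check above: if an earlier filter injected the synonym "fussball" (term length 8) over original text spanning offsets 0-7, the offset span no longer matches the term length, so decompounded parts such as "fuss" and "ball" keep the original 0-7 offsets; only when the span and the term length agree are sub-token offsets computed from the part's position within the term.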

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import java.io.IOException;
import java.util.Set;
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
*/
public final class TypeTokenFilter extends FilteringTokenFilter {
private final Set<String> stopTypes;
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
super(enablePositionIncrements, input);
this.stopTypes = stopTypes;
}
/**
* Returns true for tokens whose typeAttribute.type() is not a stop type; all other tokens are dropped.
*/
@Override
protected boolean accept() throws IOException {
return !stopTypes.contains(typeAttribute.type());
}
}
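A minimal usage sketch (illustrative, not part of this file): drop numeric tokens by blocking the type that the standard tokenizer assigns to them. The "<NUM>" type string, the Version constant, and the sample text are assumptions; check the actual constants in the tree.
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
// ...
Set<String> stopTypes = new HashSet<String>(Arrays.asList("<NUM>"));
Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_40, new StringReader("released in 2012"));
TokenStream stream = new TypeTokenFilter(true, tokenizer, stopTypes);
// tokens typed "<NUM>" (e.g. "2012") are dropped; with enablePositionIncrements=true
// the position gap where a number was removed is preserved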

View File

@ -60,6 +60,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
private final StringBuilder hyphenated = new StringBuilder();
private State savedState;
private boolean exhausted = false;
private int lastEndOffset = 0;
/**
* Creates a new HyphenatedWordsFilter
@ -78,6 +79,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
while (!exhausted && input.incrementToken()) {
char[] term = termAttribute.buffer();
int termLength = termAttribute.length();
lastEndOffset = offsetAttribute.endOffset();
if (termLength > 0 && term[termLength - 1] == '-') {
// a hyphenated word
@ -119,6 +121,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
hyphenated.setLength(0);
savedState = null;
exhausted = false;
lastEndOffset = 0;
}
// ================================================= Helper Methods ================================================
@ -127,8 +130,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
* Writes the joined unhyphenated term
*/
private void unhyphenate() {
int endOffset = offsetAttribute.endOffset();
restoreState(savedState);
savedState = null;
@ -140,7 +141,7 @@ public final class HyphenatedWordsFilter extends TokenFilter {
hyphenated.getChars(0, length, term, 0);
termAttribute.setLength(length);
offsetAttribute.setOffset(offsetAttribute.startOffset(), endOffset);
offsetAttribute.setOffset(offsetAttribute.startOffset(), lastEndOffset);
hyphenated.setLength(0);
}
}

View File

@ -183,31 +183,33 @@ public final class PatternAnalyzer extends Analyzer {
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* reader (e.g. charfilter) of the original text. can be null.
* @param text
* the string to tokenize
* @return a new token stream
*/
public TokenStreamComponents createComponents(String fieldName, String text) {
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
// Ideally the Analyzer superclass should have a method with the same signature,
// with a default impl that simply delegates to the StringReader flavour.
if (text == null)
throw new IllegalArgumentException("text must not be null");
if (pattern == NON_WORD_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(text, true, toLowerCase, stopWords));
return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
} else if (pattern == WHITESPACE_PATTERN) { // fast path
return new TokenStreamComponents(new FastStringTokenizer(text, false, toLowerCase, stopWords));
return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
}
Tokenizer tokenizer = new PatternTokenizer(text, pattern, toLowerCase);
Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
return new TokenStreamComponents(tokenizer, result);
}
/**
* Creates a token stream that tokenizes all the text in the given Reader;
* This implementation forwards to <code>tokenStream(String, String)</code> and is
* less efficient than <code>tokenStream(String, String)</code>.
* This implementation forwards to <code>tokenStream(String, Reader, String)</code> and is
* less efficient than <code>tokenStream(String, Reader, String)</code>.
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
@ -219,7 +221,7 @@ public final class PatternAnalyzer extends Analyzer {
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
try {
String text = toString(reader);
return createComponents(fieldName, text);
return createComponents(fieldName, reader, text);
} catch (IOException e) {
throw new RuntimeException(e);
}
@ -332,7 +334,8 @@ public final class PatternAnalyzer extends Analyzer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
super(input);
this.pattern = pattern;
this.str = str;
this.matcher = pattern.matcher(str);
@ -359,7 +362,7 @@ public final class PatternAnalyzer extends Analyzer {
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
termAtt.setEmpty().append(text);
offsetAtt.setOffset(start, end);
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
return true;
}
if (!isMatch) return false;
@ -369,7 +372,7 @@ public final class PatternAnalyzer extends Analyzer {
@Override
public final void end() {
// set final offset
final int finalOffset = str.length();
final int finalOffset = correctOffset(str.length());
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -406,7 +409,8 @@ public final class PatternAnalyzer extends Analyzer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
super(input);
this.str = str;
this.isLetter = isLetter;
this.toLowerCase = toLowerCase;
@ -458,7 +462,7 @@ public final class PatternAnalyzer extends Analyzer {
return false;
}
termAtt.setEmpty().append(text);
offsetAtt.setOffset(start, i);
offsetAtt.setOffset(correctOffset(start), correctOffset(i));
return true;
}
@ -466,7 +470,7 @@ public final class PatternAnalyzer extends Analyzer {
public final void end() {
// set final offset
final int finalOffset = str.length();
this.offsetAtt.setOffset(finalOffset, finalOffset);
this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
private boolean isTokenChar(char c, boolean isLetter) {
@ -479,6 +483,7 @@ public final class PatternAnalyzer extends Analyzer {
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
this.str = PatternAnalyzer.toString(input);
}

View File

@ -68,7 +68,7 @@ public final class TrimFilter extends TokenFilter {
} else {
termAtt.setEmpty();
}
if (updateOffsets) {
if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset()) {
int newStart = offsetAtt.startOffset()+start;
int newEnd = offsetAtt.endOffset() - (start<end ? endOff:0);
offsetAtt.setOffset(newStart, newEnd);

View File

@ -405,10 +405,20 @@ public final class WordDelimiterFilter extends TokenFilter {
clearAttributes();
termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
offsetAttribute.setOffset(startOffSet, endOffSet);
int startOffset = savedStartOffset + iterator.current;
int endOffset = savedStartOffset + iterator.end;
if (hasIllegalOffsets) {
// historically this filter did this regardless of 'isSingleWord',
// but we must do a sanity check:
if (isSingleWord && startOffset <= savedEndOffset) {
offsetAttribute.setOffset(startOffset, savedEndOffset);
} else {
offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
}
} else {
offsetAttribute.setOffset(startOffset, endOffset);
}
posIncAttribute.setPositionIncrement(position(false));
typeAttribute.setType(savedType);
}

View File

@ -74,7 +74,8 @@ public final class EdgeNGramTokenizer extends Tokenizer {
private int gramSize;
private Side side;
private boolean started = false;
private int inLen;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
@ -183,7 +184,11 @@ public final class EdgeNGramTokenizer extends Tokenizer {
if (!started) {
started = true;
char[] chars = new char[1024];
int charsRead = input.read(chars);
charsRead = input.read(chars);
if (charsRead < 0) {
charsRead = inLen = 0;
return false;
}
inStr = new String(chars, 0, charsRead).trim(); // remove any leading or trailing spaces
inLen = inStr.length();
gramSize = minGram;
@ -211,7 +216,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
final int finalOffset = inLen;
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -225,5 +230,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
public void reset() throws IOException {
super.reset();
started = false;
charsRead = 0;
}
}
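To see why charsRead is now tracked separately from inLen (hypothetical input, not from this patch): for the input "  ab  " delivered by a StringReader, input.read(chars) returns charsRead = 6, while the trimmed inStr is "ab" and inLen = 2. Grams are still generated against the trimmed string, but end() now reports correctOffset(charsRead), so the final offset accounts for the whole consumed input instead of stopping at the trimmed length.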

View File

@ -35,7 +35,8 @@ public final class NGramTokenizer extends Tokenizer {
private int minGram, maxGram;
private int gramSize;
private int pos = 0;
private int inLen;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
private boolean started = false;
@ -104,7 +105,11 @@ public final class NGramTokenizer extends Tokenizer {
started = true;
gramSize = minGram;
char[] chars = new char[1024];
input.read(chars);
charsRead = input.read(chars);
if (charsRead < 0) {
charsRead = inLen = 0;
return false;
}
inStr = new String(chars).trim(); // remove any leading or trailing whitespace
inLen = inStr.length();
}
@ -128,7 +133,7 @@ public final class NGramTokenizer extends Tokenizer {
@Override
public final void end() {
// set final offset
final int finalOffset = inLen;
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@ -143,5 +148,6 @@ public final class NGramTokenizer extends Tokenizer {
super.reset();
started = false;
pos = 0;
charsRead = 0;
}
}

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
public final class PositionFilter extends TokenFilter {
/** Position increment to assign to all but the first token - default = 0 */
private int positionIncrement = 0;
private final int positionIncrement;
/** The first token must have non-zero positionIncrement **/
private boolean firstTokenPositioned = false;
@ -44,7 +44,7 @@ public final class PositionFilter extends TokenFilter {
* @param input the input stream
*/
public PositionFilter(final TokenStream input) {
super(input);
this(input, 0);
}
/**
@ -56,7 +56,7 @@ public final class PositionFilter extends TokenFilter {
* token from the input stream
*/
public PositionFilter(final TokenStream input, final int positionIncrement) {
this(input);
super(input);
this.positionIncrement = positionIncrement;
}

View File

@ -68,6 +68,7 @@ public final class ThaiWordFilter extends TokenFilter {
private CharTermAttribute clonedTermAtt = null;
private OffsetAttribute clonedOffsetAtt = null;
private boolean hasMoreTokensInClone = false;
private boolean hasIllegalOffsets = false; // only if the length changed before this filter
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
@ -86,7 +87,11 @@ public final class ThaiWordFilter extends TokenFilter {
if (end != BreakIterator.DONE) {
clonedToken.copyTo(this);
termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
}
if (handlePosIncr) posAtt.setPositionIncrement(1);
return true;
}
@ -102,6 +107,10 @@ public final class ThaiWordFilter extends TokenFilter {
}
hasMoreTokensInClone = true;
// if the length implied by the start and end offsets doesn't match the term text
// length, assume this token is a synonym and don't adjust the offsets.
hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();
// we lazy init the cloned token, as in ctor not all attributes may be added
if (clonedToken == null) {
@ -118,7 +127,11 @@ public final class ThaiWordFilter extends TokenFilter {
int end = breaker.next();
if (end != BreakIterator.DONE) {
termAtt.setLength(end);
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
if (hasIllegalOffsets) {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
} else {
offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
}
// position increment keeps as it is for first token
return true;
}

View File

@ -306,13 +306,14 @@ public final class WikipediaTokenizer extends Tokenizer {
@Override
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
tokens = null;
scanner.reset();
}
@Override
public void reset(Reader reader) throws IOException {
super.reset(reader);
reset();
scanner.yyreset(input);
}
@Override

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/30/11 12:11 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 1/22/12 10:26 PM */
package org.apache.lucene.analysis.wikipedia;
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 9/30/11 12:11 PM from the specification file
* <tt>/lucene/jflex/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 1/22/12 10:26 PM from the specification file
* <tt>/home/rmuir/workspace/lucene-clean-trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@ -498,6 +498,14 @@ final int setText(StringBuilder buffer){
return length;
}
final void reset() {
currentTokType = 0;
numBalanced = 0;
positionInc = 1;
numLinkToks = 0;
numWikiTokensSeen = 0;
}

View File

@ -91,6 +91,14 @@ final int setText(StringBuilder buffer){
return length;
}
final void reset() {
currentTokType = 0;
numBalanced = 0;
positionInc = 1;
numLinkToks = 0;
numWikiTokensSeen = 0;
}
%}

View File

@ -23,6 +23,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.junit.Ignore;
import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@ -41,9 +42,9 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
String gold = " this is some text here is a link and " +
"another link . " +
"This is an entity: & plus a <. Here is an &. ";
String gold = "\nthis is some text\n here is a link and " +
"another link. " +
"This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder();
int ch = -1;
@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
assertEquals(gold, builder.toString());
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
//Some sanity checks, but not a full-fledged check
@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testMSWord14GeneratedHTML() throws Exception {
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
String gold = "This is a test";
StringBuilder builder = new StringBuilder();
int ch = 0;
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
gold, builder.toString().trim());
}
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testEntities() throws Exception {
@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testMoreEntities() throws Exception {
@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testReserved() throws Exception {
@ -147,45 +161,248 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testMalformedHTML() throws Exception {
String test = "a <a hr<ef=aa<a>> </close</a>";
String gold = "a <a hr<ef=aa > </close ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
String[] testGold = {
"a <a hr<ef=aa<a>> </close</a>",
"a <a hr<ef=aa> </close",
"<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
"Submit a Site",
"<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
"Christian Science",
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
"\n",
// "<" before ">" inhibits tag recognition
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
"",
"<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
"\n",
"<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
"?",
"<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
"",
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
"",
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
"Levi.com/BMX 2008 Clip of the Week 29...",
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
"Printer Friendly",
"<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
"Add to Favorites",
"<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
"At",
"E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
"E-mail: XXXXXX@example.com ",
"<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
"\nA'13?\n",
"<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
"\nHubert \"Geese\" Ausby\n",
"<href=\"http://anbportal.com/mms/login.asp\">",
"\n",
"<a href=\"",
"<a href=\"",
"<a href=\">",
"",
"<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
"#",
"<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
"",
"<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
"",
"<a href=#Services & Support>",
"",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
"",
"<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
"",
"<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
"\n",
"<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
"#",
"<a href= >",
"",
"<ahref=http:..",
"<ahref=http:..",
"<ahref=http:..>",
"\n",
"<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
"\nA",
"<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
"",
"<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
"",
"<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
"",
"<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
"Lamborghini /a>",
"<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
"",
"<a href=/myspace !style='color:#993333'>",
"",
"<meta name=3DProgId content=3DExcel.Sheet>",
"\n",
"<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
"\n",
"<td bgcolor=3D\"#FFFFFF\" nowrap>",
"\n",
"<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
"\"predicciones mundiales 2009\"",
"<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
"",
"<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
"Bishop\"",
"<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
"BHAA Eircom 2 & 5 miles CC combined start",
"<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
"",
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
"",
// "<" before ">" inhibits tag recognition
"<input type=\"text\" value=\"<search here>\">",
"<input type=\"text\" value=\"\n\">",
"<input type=\"text\" value=\"<search here\">",
"<input type=\"text\" value=\"\n",
"<input type=\"text\" value=\"search here>\">",
"\">",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
"",
"<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
"\n\n\n",
"<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
"\n\n\n\n\n\n\n\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
}
public void testBufferOverflow() throws Exception {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
testBuilder.append("-->foo");
processBuffer(testBuilder.toString(), "Failed w/ comment");
String gold = "foo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<?");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
processBuffer(testBuilder.toString(), "Failed on tag");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
private void appendChars(StringBuilder testBuilder, int numChars) {
@ -208,13 +425,14 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
test, builder.toString());
}
public void testComment() throws Exception {
String test = "<!--- three dashes, still a valid comment ---> ";
String gold = " ";
String gold = " ";
Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
int ch = 0;
StringBuilder builder = new StringBuilder();
@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
@ -247,15 +466,32 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testOffsets() throws Exception {
doTestOffsets("hello X how X are you");
// doTestOffsets("hello X how X are you");
doTestOffsets("hello <p> X<p> how <p>X are you");
doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
// test backtracking
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
}
@Ignore("broken offsets: see LUCENE-2208")
static void assertLegalOffsets(String in) throws Exception {
int length = in.length();
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
int ch = 0;
int off = 0;
while ((ch = reader.read()) != -1) {
int correction = reader.correctOffset(off);
assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
correction <= length);
off++;
}
}
public void testLegalOffsets() throws Exception {
assertLegalOffsets("hello world");
assertLegalOffsets("hello &#x world");
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@ -267,11 +503,361 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
return new HTMLStripCharFilter(CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
public void testServerSideIncludes() throws Exception {
String test = "one<img src=\"image.png\"\n"
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
gold = "one\ntwo";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testScriptQuotes() throws Exception {
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
gold = "hello\n";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testEscapeScript() throws Exception {
String test = "one<script no-value-attr>callSomeMethod();</script>two";
String gold = "one<script no-value-attr></script>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testStyle() throws Exception {
String test = "one<style type=\"text/css\">\n"
+ "<!--\n"
+ "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
+ "-->\n"
+ "</style>two";
String gold = "one\ntwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testEscapeStyle() throws Exception {
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
String gold = "one<style type=\"text/css\"></style>two";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testBR() throws Exception {
String[] testGold = {
"one<BR />two<br>three",
"one\ntwo\nthree",
"one<BR some stuff here too>two</BR>",
"one\ntwo\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
String result = builder.toString();
assertEquals("Test: '" + test + "'", gold, result);
}
}
public void testEscapeBR() throws Exception {
String test = "one<BR class='whatever'>two</\nBR\n>";
String gold = "one<BR class='whatever'>two</\nBR\n>";
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(test)), escapedTags);
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testInlineTagsNoSpace() throws Exception {
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
String gold = "onetwo2e.three";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testCDATA() throws Exception {
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
String gold = "one<one><two>three<four></four></two></one>two";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
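// in the second case the inner "<![CDATA[" is ordinary CDATA content, and the
// "]]]]><![CDATA[>" idiom ends the first section after "]]" and reopens a new one at ">",
// which is why a literal "]]>" survives in the stripped output below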
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
gold = "onetwo<![CDATA[three]]>fourfive";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testUppercaseCharacterEntityVariants() throws Exception {
String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
String gold = " \"-\u00A9>><<\u00AE&";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testMSWordMalformedProcessingInstruction() throws Exception {
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
String gold = "onetwo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testSupplementaryCharsInTags() throws Exception {
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
public void testRandomBrokenHTML() throws Exception {
int maxNumElements = 10000;
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text)));
while (reader.read() != -1);
}
public void testRandomText() throws Exception {
StringBuilder text = new StringBuilder();
int minNumWords = 10;
int maxNumWords = 10000;
int minWordLength = 3;
int maxWordLength = 20;
int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
switch (_TestUtil.nextInt(random, 0, 4)) {
case 0: {
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
text.append(' ');
}
break;
}
case 1: {
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomRealisticUnicodeString
(random, minWordLength, maxWordLength));
text.append(' ');
}
break;
}
default: { // ASCII 50% of the time
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
text.append(_TestUtil.randomSimpleString(random));
text.append(' ');
}
}
}
Reader reader = new HTMLStripCharFilter
(CharReader.get(new StringReader(text.toString())));
while (reader.read() != -1);
}
public void testUTF16Surrogates() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
}
};
// Paired surrogates
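// 55404 is the decimal form of the high surrogate U+D86C and 56321 the decimal form of the
// low surrogate U+DC01; each pair below therefore decodes to the single supplementary
// code point U+2B001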
assertAnalyzesTo(analyzer, " one two &#xD86C;&#XdC01;three",
new String[] { "one", "two", "\uD86C\uDC01three" } );
assertAnalyzesTo(analyzer, " &#55404;&#XdC01;", new String[] { "\uD86C\uDC01" } );
assertAnalyzesTo(analyzer, " &#xD86C;&#56321;", new String[] { "\uD86C\uDC01" } );
assertAnalyzesTo(analyzer, " &#55404;&#56321;", new String[] { "\uD86C\uDC01" } );
// Improperly paired surrogates
assertAnalyzesTo(analyzer, " &#55404;&#57999;", new String[] { "\uFFFD\uE28F" } );
assertAnalyzesTo(analyzer, " &#xD86C;&#57999;", new String[] { "\uFFFD\uE28F" } );
assertAnalyzesTo(analyzer, " &#55002;&#XdC01;", new String[] { "\uD6DA\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55002;&#56321;", new String[] { "\uD6DA\uFFFD" } );
// Unpaired high surrogates
assertAnalyzesTo(analyzer, " &#Xd921;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#Xd921", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#Xd921<br>", new String[] { "&#Xd921" } );
assertAnalyzesTo(analyzer, " &#55528;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55528", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#55528<br>", new String[] { "&#55528" } );
// Unpaired low surrogates
assertAnalyzesTo(analyzer, " &#xdfdb;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#xdfdb", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#xdfdb<br>", new String[] { "&#xdfdb" } );
assertAnalyzesTo(analyzer, " &#57209;", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
}
}

View File

@ -0,0 +1,653 @@
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:w="urn:schemas-microsoft-com:office:word"
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=ProgId content=Word.Document>
<meta name=Generator content="Microsoft Word 14">
<meta name=Originator content="Microsoft Word 14">
<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
<!--[if gte mso 9]><xml>
<o:DocumentProperties>
<o:Author>s</o:Author>
<o:LastAuthor>s</o:LastAuthor>
<o:Revision>1</o:Revision>
<o:TotalTime>1</o:TotalTime>
<o:Created>2012-01-13T03:36:00Z</o:Created>
<o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
<o:Pages>1</o:Pages>
<o:Words>8</o:Words>
<o:Characters>48</o:Characters>
<o:Lines>1</o:Lines>
<o:Paragraphs>1</o:Paragraphs>
<o:CharactersWithSpaces>55</o:CharactersWithSpaces>
<o:Version>14.00</o:Version>
</o:DocumentProperties>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
</o:OfficeDocumentSettings>
</xml><![endif]-->
<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
<link rel=colorSchemeMapping
href="This%20is%20a%20test_files/colorschememapping.xml">
<!--[if gte mso 9]><xml>
<w:WordDocument>
<w:SpellingState>Clean</w:SpellingState>
<w:GrammarState>Clean</w:GrammarState>
<w:TrackMoves>false</w:TrackMoves>
<w:TrackFormatting/>
<w:PunctuationKerning/>
<w:ValidateAgainstSchemas/>
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
<w:DoNotPromoteQF/>
<w:LidThemeOther>EN-US</w:LidThemeOther>
<w:LidThemeAsian>X-NONE</w:LidThemeAsian>
<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
<w:Compatibility>
<w:BreakWrappedTables/>
<w:SnapToGridInCell/>
<w:WrapTextWithPunct/>
<w:UseAsianBreakRules/>
<w:DontGrowAutofit/>
<w:SplitPgBreakAndParaMark/>
<w:EnableOpenTypeKerning/>
<w:DontFlipMirrorIndents/>
<w:OverrideTableStyleHps/>
</w:Compatibility>
<m:mathPr>
<m:mathFont m:val="Cambria Math"/>
<m:brkBin m:val="before"/>
<m:brkBinSub m:val="&#45;-"/>
<m:smallFrac m:val="off"/>
<m:dispDef/>
<m:lMargin m:val="0"/>
<m:rMargin m:val="0"/>
<m:defJc m:val="centerGroup"/>
<m:wrapIndent m:val="1440"/>
<m:intLim m:val="subSup"/>
<m:naryLim m:val="undOvr"/>
</m:mathPr></w:WordDocument>
</xml><![endif]--><!--[if gte mso 9]><xml>
<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
DefSemiHidden="true" DefQFormat="false" DefPriority="99"
LatentStyleCount="267">
<w:LsdException Locked="false" Priority="0" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
<w:LsdException Locked="false" Priority="9" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
<w:LsdException Locked="false" Priority="10" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Title"/>
<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
<w:LsdException Locked="false" Priority="11" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
<w:LsdException Locked="false" Priority="22" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
<w:LsdException Locked="false" Priority="20" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
<w:LsdException Locked="false" Priority="59" SemiHidden="false"
UnhideWhenUsed="false" Name="Table Grid"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
<w:LsdException Locked="false" Priority="1" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 1"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
<w:LsdException Locked="false" Priority="34" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
<w:LsdException Locked="false" Priority="29" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
<w:LsdException Locked="false" Priority="30" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 1"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 2"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 2"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 3"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 3"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 4"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 4"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 5"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 5"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 6"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 6"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
<w:LsdException Locked="false" Priority="19" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
<w:LsdException Locked="false" Priority="21" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
<w:LsdException Locked="false" Priority="31" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
<w:LsdException Locked="false" Priority="32" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
<w:LsdException Locked="false" Priority="33" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
</w:LatentStyles>
</xml><![endif]-->
<style>
<!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:1;
mso-generic-font-family:roman;
mso-font-format:other;
mso-font-pitch:variable;
mso-font-signature:0 0 0 0 0 0;}
@font-face
{font-family:Cambria;
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:0;
mso-generic-font-family:roman;
mso-font-pitch:variable;
mso-font-signature:-536870145 1073743103 0 0 415 0;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;
mso-font-charset:0;
mso-generic-font-family:swiss;
mso-font-pitch:variable;
mso-font-signature:-520092929 1073786111 9 0 415 0;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-parent:"";
margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
h1
{mso-style-priority:9;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Heading 1 Char";
mso-style-next:Normal;
margin-top:24.0pt;
margin-right:0in;
margin-bottom:0in;
margin-left:0in;
margin-bottom:.0001pt;
line-height:115%;
mso-pagination:widow-orphan lines-together;
page-break-after:avoid;
mso-outline-level:1;
font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
mso-font-kerning:0pt;}
p.MsoTitle, li.MsoTitle, div.MsoTitle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.TitleChar
{mso-style-name:"Title Char";
mso-style-priority:10;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:Title;
mso-ansi-font-size:26.0pt;
mso-bidi-font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.Heading1Char
{mso-style-name:"Heading 1 Char";
mso-style-priority:9;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:"Heading 1";
mso-ansi-font-size:14.0pt;
mso-bidi-font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
font-weight:bold;}
.MsoChpDefault
{mso-style-type:export-only;
mso-default-props:yes;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
.MsoPapDefault
{mso-style-type:export-only;
margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;
mso-header-margin:.5in;
mso-footer-margin:.5in;
mso-paper-source:0;}
div.WordSection1
{page:WordSection1;}
-->
</style>
<!--[if gte mso 10]>
<style>
/* Style Definitions */
table.MsoNormalTable
{mso-style-name:"Table Normal";
mso-tstyle-rowband-size:0;
mso-tstyle-colband-size:0;
mso-style-noshow:yes;
mso-style-priority:99;
mso-style-parent:"";
mso-padding-alt:0in 5.4pt 0in 5.4pt;
mso-para-margin-top:0in;
mso-para-margin-right:0in;
mso-para-margin-bottom:10.0pt;
mso-para-margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
</style>
<![endif]--><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026"/>
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1"/>
</o:shapelayout></xml><![endif]-->
</head>
<body lang=EN-US style='tab-interval:.5in'>
<div class=WordSection1>
<div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
<p class=MsoTitle>This is a test</p>
</div>
</div>
</body>
</html>

View File

@ -117,5 +117,10 @@ public class TestChineseTokenizer extends BaseTokenStreamTestCase
assertAnalyzesTo(justFilter, "This is a Test. b c d",
new String[] { "This", "Test." });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
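// checkRandomData (BaseTokenStreamTestCase) runs random text through the analyzer and
// sanity-checks the produced tokens (offsets, increments, reusability) rather than
// asserting any specific output; the same idiom is used by the other testRandomStrings
// methods added in this commit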
checkRandomData(random, new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -306,4 +306,31 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
return new TokenStreamComponents(t, cgf);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, t, commonWords);
return new TokenStreamComponents(t, new CommonGramsQueryFilter(cgf));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,14 +18,19 @@ package org.apache.lucene.analysis.compound;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
@ -299,5 +304,61 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
}
}
}
// SOLR-2891
// *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
final CharArraySet dict = makeDictionary("fall");
final NormalizeCharMap normMap = new NormalizeCharMap();
normMap.add("ü", "ue");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict);
return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(normMap, CharReader.get(reader));
}
};
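// the original input "banküberfall" is 12 characters; the char filter expands it to the
// 13-character "bankueberfall", and the expected end offsets below stay at 12 (the length
// of the original text) instead of start offset + term length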
assertAnalyzesTo(analyzer, "banküberfall",
new String[] { "bankueberfall", "fall" },
new int[] { 0, 0 },
new int[] { 12, 12 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, dict));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, tokenizer, hyphenator);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -0,0 +1,92 @@
package org.apache.lucene.analysis.core;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
import java.io.IOException;
import java.io.StringReader;
import java.util.Set;
public class TestTypeTokenFilter extends BaseTokenStreamTestCase {
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
/**
* Test Position increments applied by TypeTokenFilter with and without enabling this option.
*/
public void testStopPositons() throws IOException {
StringBuilder sb = new StringBuilder();
for (int i = 10; i < 20; i++) {
if (i % 3 != 0) {
sb.append(i).append(" ");
} else {
String w = English.intToEnglish(i).trim();
sb.append(w).append(" ");
}
}
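// sb now holds "10 11 twelve 13 14 fifteen 16 17 eighteen 19 ": every word token that
// survives the <NUM> filter is preceded by two removed number tokens, hence the expected
// position increment of 3 when increments are enabled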
log(sb.toString());
String stopTypes[] = new String[]{"<NUM>"};
Set<String> stopSet = asSet(stopTypes);
// with increments
StringReader reader = new StringReader(sb.toString());
TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
// without increments
reader = new StringReader(sb.toString());
typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
}
private void testPositons(TypeTokenFilter stpf) throws IOException {
TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
stpf.reset();
boolean enablePositionIncrements = stpf.getEnablePositionIncrements();
while (stpf.incrementToken()) {
log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1",
posIncrAtt.getPositionIncrement(), enablePositionIncrements ? 3 : 1);
}
stpf.end();
stpf.close();
}
// print debug info depending on VERBOSE
private static void log(String s) {
if (VERBOSE) {
System.out.println(s);
}
}
}

View File

@ -18,12 +18,15 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.junit.BeforeClass;
@ -57,4 +60,17 @@ public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY));
}
};
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;
@ -132,4 +133,10 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
assertTokenStreamContents(ts2, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,11 +17,14 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
@ -1907,4 +1910,17 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
assertTrue(stream.incrementToken());
assertEquals(expected, termAtt.toString());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,12 +18,14 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
@ -117,4 +119,18 @@ public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
minWordLength, maxWordCount, maxTokenLength);
}
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,11 +17,14 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* HyphenatedWordsFilter test
@ -46,5 +49,29 @@ public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
}
public void testOffsets() throws Exception {
String input = "abc- def geh 1234- 5678-";
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "abcdef", "geh", "12345678-" },
new int[] { 0, 9, 13 },
new int[] { 8, 12, 24 });
}
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,13 +17,16 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
/** Test {@link KeepWordFilter} */
@ -57,4 +60,23 @@ public class TestKeepWordFilter extends BaseTokenStreamTestCase {
stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final Set<String> words = new HashSet<String>();
words.add( "a" );
words.add( "b" );
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
return new TokenStreamComponents(tokenizer, stream);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,13 +17,21 @@
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util._TestUtil;
import java.io.Reader;
import java.util.Iterator;
import java.util.Arrays;
@ -116,6 +124,45 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
}
// some helper methods for the below test with synonyms
private String randomNonEmptyString() {
while(true) {
final String s = _TestUtil.randomUnicodeString(random).trim();
if (s.length() != 0 && s.indexOf('\u0000') == -1) {
return s;
}
}
}
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
new CharsRef(output.replaceAll(" +", "\u0000")),
keepOrig);
}
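// SynonymMap encodes multi-word synonyms with \u0000 as the word separator, which is why
// add() maps runs of spaces to \u0000 and randomNonEmptyString() rejects strings that
// already contain that character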
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
SynonymMap.Builder b = new SynonymMap.Builder(random.nextBoolean());
final int numEntries = atLeast(10);
for (int j = 0; j < numEntries; j++) {
add(b, randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
}
final SynonymMap map = b.build();
final boolean ignoreCase = random.nextBoolean();
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
TokenStream stream = new SynonymFilter(tokenizer, map, ignoreCase);
return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
}
};
checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
}
}
}

View File

@ -18,11 +18,15 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
/**
@ -103,4 +107,27 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
}
}
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -298,4 +298,28 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 10, 15, 15 },
new int[] { 2, 1, 0 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
int numIterations = atLeast(5);
for (int i = 0; i < numIterations; i++) {
final int flags = random.nextInt(512);
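// 512 = 2^9, presumably chosen so that random.nextInt covers every combination of the
// filter's nine flag bits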
final CharArraySet protectedWords;
if (random.nextBoolean()) {
protectedWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
} else {
protectedWords = null;
}
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}
}

View File

@ -129,4 +129,27 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -18,9 +18,13 @@ package org.apache.lucene.analysis.ngram;
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
/**
* Tests {@link EdgeNGramTokenizer} for correctness.
@ -95,4 +99,25 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3}, 5 /* abcde */);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.FRONT, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@ -33,89 +34,102 @@ import java.io.StringReader;
* Tests {@link NGramTokenFilter} for correctness.
*/
public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
private TokenStream input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
private TokenStream input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false);
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
assertTrue(gotException);
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenFilter(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
assertTokenStreamContents(filter,
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
assertTokenStreamContents(filter, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5});
}
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
assertTokenStreamContents(filter,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5}
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
assertTokenStreamContents(filter, new String[0], new int[0], new int[0]);
}
public void testSmallTokenInStream() throws Exception {
input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
}
public void testReset() throws Exception {
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
}
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
filters = new NGramTokenFilter(filters, 2, 2);
return new TokenStreamComponents(tokenizer, filters);
}
};
assertAnalyzesTo(analyzer, "mosfellsbær",
new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer,
new NGramTokenFilter(tokenizer, 2, 15));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,71 +18,86 @@ package org.apache.lucene.analysis.ngram;
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests {@link NGramTokenizer} for correctness.
*/
public class NGramTokenizerTest extends BaseTokenStreamTestCase {
private StringReader input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new StringReader("abcde");
private StringReader input;
@Override
public void setUp() throws Exception {
super.setUp();
input = new StringReader("abcde");
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 2, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
assertTrue(gotException);
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
public void testInvalidInput2() throws Exception {
boolean gotException = false;
try {
new NGramTokenizer(input, 0, 1);
} catch (IllegalArgumentException e) {
gotException = true;
}
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
5 /* abcde */
assertTrue(gotException);
}
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
assertTokenStreamContents(tokenizer, new String[]{"ab","bc","cd","de"}, new int[]{0,1,2,3}, new int[]{2,3,4,5}, 5 /* abcde */);
}
public void testNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
assertTokenStreamContents(tokenizer,
new String[]{"a","b","c","d","e", "ab","bc","cd","de", "abc","bcd","cde"},
new int[]{0,1,2,3,4, 0,1,2,3, 0,1,2},
new int[]{1,2,3,4,5, 2,3,4,5, 3,4,5},
5 /* abcde */
);
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
}
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
assertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
public void testReset() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
tokenizer.reset(new StringReader("abcde"));
assertTokenStreamContents(tokenizer, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5}, 5 /* abcde */);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new NGramTokenizer(reader, 2, 15);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,10 +17,13 @@ package org.apache.lucene.analysis.path;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
@@ -193,4 +196,16 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
new int[]{1},
path.length());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new PathHierarchyTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,9 +17,13 @@ package org.apache.lucene.analysis.path;
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
@@ -154,4 +158,16 @@ public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
new int[]{1, 0},
path.length());
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new ReversePathHierarchyTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,14 +18,17 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Tests {@link PatternReplaceCharFilter}
@@ -172,4 +175,21 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
private Pattern pattern( String p ){
return Pattern.compile( p );
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new PatternReplaceCharFilter(Pattern.compile("a"), "b", CharReader.get(reader));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,10 +17,13 @@
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Pattern;
@@ -77,5 +80,28 @@ public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(ts,
new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", false);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("a"), "b", true);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,17 +18,22 @@
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestPatternTokenizer extends BaseTokenStreamTestCase
@@ -117,4 +122,35 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
in.close();
return out.toString();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = null;
try {
tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), -1);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = null;
try {
tokenizer = new PatternTokenizer(reader, Pattern.compile("a"), 0);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -17,11 +17,14 @@
package org.apache.lucene.analysis.reverse;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
@@ -96,4 +99,16 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
assertEquals("abcfed𩬅愯瀛", new String(buffer));
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(TEST_VERSION_CURRENT, tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -18,9 +18,12 @@ package org.apache.lucene.analysis.shingle;
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -1129,4 +1132,16 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
token.setPositionIncrement(positionIncrement);
return token;
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -160,7 +160,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
hasSentence = false;
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
return true;
} else {
return false;
@@ -215,7 +215,7 @@ public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
clearAttributes();
termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
posBoost = 0;
return true;

View File

@@ -18,12 +18,15 @@
package org.apache.lucene.analysis.wikipedia;
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;
@@ -169,4 +172,17 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
assertFalse(tf.incrementToken());
tf.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WikipediaTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
}
}

View File

@@ -112,7 +112,24 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
</assertions>
</java>
</target>
<property name="html.strip.charfilter.supp.macros.output.file"
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
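<!-- Regenerates the supplementary-codepoint jflex macros: runs GenerateHTMLStripCharFilterSupplementaryMacros and captures its standard output into the file named by the property above. -->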
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
dir="."
fork="true"
failonerror="true"
output="${html.strip.charfilter.supp.macros.output.file}">
<classpath>
<path refid="additional.dependencies"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
</java>
</target>
<target name="compile-tools" depends="common.compile-tools">
<compile
srcdir="src/tools/java"

View File

@@ -111,7 +111,7 @@ public final class ICUTokenizer extends Tokenizer {
@Override
public void end() throws IOException {
final int finalOffset = (length < 0) ? offset : offset + length;
offsetAtt.setOffset(finalOffset, finalOffset);
offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
}
/*

View File

@@ -0,0 +1,110 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.DateFormat;
import java.util.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;
/** creates a macro to augment jflex's unicode support for > BMP */
public class GenerateHTMLStripCharFilterSupplementaryMacros {
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
private static final String NL = System.getProperty("line.separator");
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
(DateFormat.FULL, DateFormat.FULL, Locale.US);
static {
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
}
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2010 The Apache Software Foundation." + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
public static void main(String args[]) throws Exception {
outputHeader();
outputMacro("ID_Start_Supp", "[:ID_Start:]");
outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
}
static void outputHeader() {
System.out.print(APACHE_LICENSE);
System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
System.out.println(DATE_FORMAT.format(new Date()));
System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
System.out.print(NL + NL);
}
// we have to carefully output the possibilities as compact utf-16
// range expressions, or jflex will OOM!
static void outputMacro(String name, String pattern) {
UnicodeSet set = new UnicodeSet(pattern);
set.removeAll(BMP);
System.out.println(name + " = (");
// if the set is empty, we have to do this or jflex will barf
if (set.isEmpty()) {
System.out.println("\t []");
}
HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
char utf16[] = Character.toChars(it.codepoint);
UnicodeSet trails = utf16ByLead.get(utf16[0]);
if (trails == null) {
trails = new UnicodeSet();
utf16ByLead.put(utf16[0], trails);
}
trails.add(utf16[1]);
}
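// second pass: group lead surrogates by identical trail sets, so leads that share a trail regex collapse into a single alternative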
Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
String trail = entry.getValue().getRegexEquivalent();
UnicodeSet leads = utf16ByTrail.get(trail);
if (leads == null) {
leads = new UnicodeSet();
utf16ByTrail.put(trail, leads);
}
leads.add(entry.getKey());
}
boolean isFirst = true;
for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
System.out.print( isFirst ? "\t " : "\t| ");
isFirst = false;
System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
}
System.out.println(")");
}
}

View File

@@ -102,4 +102,9 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
}
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random, getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
}
}

View File

@@ -16,11 +16,17 @@
*/
package org.apache.lucene.analysis.phonetic;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util._TestUtil;
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
@@ -65,4 +71,28 @@ public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}
public void testRandom() throws Exception {
final int codeLen = _TestUtil.nextInt(random, 1, 8);
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, false));
}
};
checkRandomData(random, a, 1000 * RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new DoubleMetaphoneFilter(tokenizer, codeLen, true));
}
};
checkRandomData(random, b, 1000 * RANDOM_MULTIPLIER);
}
}

View File

@@ -17,6 +17,8 @@
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.commons.codec.Encoder;
@@ -25,7 +27,9 @@ import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
@@ -70,4 +74,33 @@ public class TestPhoneticFilter extends BaseTokenStreamTestCase {
PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
assertTokenStreamContents(filter, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws IOException {
Encoder encoders[] = new Encoder[] {
new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone()
};
for (final Encoder e : encoders) {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
}
};
checkRandomData(random, a, 1000*RANDOM_MULTIPLIER);
Analyzer b = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
}
};
checkRandomData(random, b, 1000*RANDOM_MULTIPLIER);
}
}
}

View File

@@ -199,9 +199,6 @@ public abstract class PerfTask implements Cloneable {
return new String(c);
}
/* (non-Javadoc)
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
String padd = getPadding();
@@ -248,22 +245,23 @@ public abstract class PerfTask implements Cloneable {
}
/**
* Task setup work that should not be measured for that specific task.
* By default it does nothing, but tasks can implement this, moving work from
* doLogic() to this method. Only the work done in doLogicis measured for this task.
* Notice that higher level (sequence) tasks containing this task would then
* measure larger time than the sum of their contained tasks.
* @throws Exception
* Task setup work that should not be measured for that specific task. By
* default it does nothing, but tasks can implement this, moving work from
* {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
* is measured for this task. Notice that higher level (sequence) tasks
* containing this task would then measure larger time than the sum of their
* contained tasks.
*/
public void setup () throws Exception {
}
/**
* Task tearDown work that should not be measured for that specific task.
* By default it does nothing, but tasks can implement this, moving work from
* doLogic() to this method. Only the work done in doLogicis measured for this task.
* Notice that higher level (sequence) tasks containing this task would then
* measure larger time than the sum of their contained tasks.
* Task tearDown work that should not be measured for that specific task. By
* default it does nothing, but tasks can implement this, moving work from
* {@link #doLogic()} to this method. Only the work done in {@link #doLogic()}
* is measured for this task. Notice that higher level (sequence) tasks
* containing this task would then measure larger time than the sum of their
* contained tasks.
*/
public void tearDown() throws Exception {
if (++logStepCount % logStep == 0) {
@@ -274,16 +272,20 @@ public abstract class PerfTask implements Cloneable {
}
/**
* Sub classes that supports parameters must override this method to return true.
* Sub classes that support parameters must override this method to return
* true.
*
* @return true iff this task supports command line params.
*/
public boolean supportsParams () {
return false;
}
/**
* Set the params of this task.
* @exception UnsupportedOperationException for tasks supporting command line parameters.
*
* @exception UnsupportedOperationException
* for tasks supporting command line parameters.
*/
public void setParams(String params) {
if (!supportsParams()) {
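The reworked javadoc above spells out the contract: preparation belongs in setup(), cleanup in tearDown(), and only doLogic() is timed. A minimal hypothetical task illustrating that split; the class name, file name, and counting logic are invented for illustration, and it assumes only PerfTask's usual members (a PerfRunData constructor, an int-returning doLogic(), and the setup()/tearDown() hooks shown above):

import java.io.BufferedReader;
import java.io.FileReader;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;

/** Hypothetical task: opening and closing the file is excluded from timing; only doLogic() is measured. */
public class CountLinesTask extends PerfTask {
  private BufferedReader reader;

  public CountLinesTask(PerfRunData runData) {
    super(runData);
  }

  @Override
  public void setup() throws Exception {
    super.setup();
    reader = new BufferedReader(new FileReader("work/lines.txt")); // un-measured preparation
  }

  @Override
  public int doLogic() throws Exception {
    int lines = 0;
    while (reader.readLine() != null) {
      lines++; // only this loop contributes to the reported time
    }
    return lines; // number of work items done
  }

  @Override
  public void tearDown() throws Exception {
    reader.close(); // un-measured cleanup
    super.tearDown();
  }
}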

View File

@@ -4,6 +4,7 @@ import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
@@ -100,6 +101,9 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
private volatile boolean closed = false;
// set refCount to 1 at start
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* Open for reading a taxonomy stored in a given {@link Directory}.
* @param directory
@@ -130,7 +134,7 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
* @throws AlreadyClosedException if this IndexReader is closed
*/
protected final void ensureOpen() throws AlreadyClosedException {
if (indexReader.getRefCount() <= 0) {
if (getRefCount() <= 0) {
throw new AlreadyClosedException("this TaxonomyReader is closed");
}
}
@@ -415,8 +419,12 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
public void close() throws IOException {
if (!closed) {
decRef();
closed = true;
synchronized (this) {
if (!closed) {
decRef();
closed = true;
}
}
}
}
@@ -555,27 +563,31 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
}
/**
* Expert: decreases the refCount of this TaxonomyReader instance.
* If the refCount drops to 0, then pending changes (if any) are
* committed to the taxonomy index and this reader is closed.
* @throws IOException
* Expert: decreases the refCount of this TaxonomyReader instance. If the
* refCount drops to 0, then this reader is closed.
*/
public void decRef() throws IOException {
ensureOpen();
if (indexReader.getRefCount() == 1) {
// Do not decRef the indexReader - doClose does it by calling reader.close()
doClose();
} else {
indexReader.decRef();
final int rc = refCount.decrementAndGet();
if (rc == 0) {
boolean success = false;
try {
doClose();
success = true;
} finally {
if (!success) {
// Put reference back on failure
refCount.incrementAndGet();
}
}
} else if (rc < 0) {
throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
}
}
/**
* Expert: returns the current refCount for this taxonomy reader
*/
/** Expert: returns the current refCount for this taxonomy reader */
public int getRefCount() {
ensureOpen();
return this.indexReader.getRefCount();
return refCount.get();
}
/**
@@ -587,6 +599,6 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
*/
public void incRef() {
ensureOpen();
this.indexReader.incRef();
refCount.incrementAndGet();
}
}

View File

@@ -11,6 +11,7 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@@ -178,4 +179,28 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
}
}
@Test
public void testRefreshAndRefCount() throws Exception {
Directory dir = new RAMDirectory(); // no need for random directories here
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new CategoryPath("a"));
taxoWriter.commit();
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals("wrong refCount", 1, taxoReader.getRefCount());
taxoReader.incRef();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
taxoWriter.addCategory(new CategoryPath("a", "b"));
taxoWriter.commit();
taxoReader.refresh();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
taxoWriter.close();
taxoReader.close();
dir.close();
}
}

View File

@@ -42,7 +42,7 @@
<h2>Search-time joins</h2>
<p>
The query time joining is terms based and implemented as two pass search. The first pass collects all the terms from a fromField
The query time joining is index term based and implemented as two pass search. The first pass collects all the terms from a fromField
that match the fromQuery. The second pass returns all documents that have matching terms in a toField to the terms
collected in the first pass.
</p>
@@ -62,7 +62,7 @@
<pre class="prettyprint">
String fromField = "from"; // Name of the from field
boolean multipleValuesPerDocument = false; // Set only yo true in the case when your fromField has multiple values per document in your index
String fromField = "to"; // Name of the to field
String toField = "to"; // Name of the to field
Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
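To make the corrected snippet concrete, here is a self-contained sketch of the two-pass join described above; the searcher and variable names are illustrative, and the createJoinQuery signature is the one shown in the snippet:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.JoinUtil;

public class JoinUsageSketch {
  /** First pass collects fromField terms matching fromQuery; second pass matches them against toField. */
  static TopDocs join(IndexSearcher fromSearcher, IndexSearcher toSearcher, String searchTerm) throws Exception {
    String fromField = "from";                 // name of the from field
    boolean multipleValuesPerDocument = false; // set to true only when fromField holds multiple values per document
    String toField = "to";                     // name of the to field
    Query fromQuery = new TermQuery(new Term("content", searchTerm));
    MultiTermQuery joinQuery =
        JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
    return toSearcher.search(joinQuery, 10);   // documents whose toField contains the collected terms
  }
}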

View File

@@ -24,11 +24,11 @@ $Id$
================== 4.0.0-dev ==================
Versions of Major Components
---------------------
Apache Tika 0.10
Apache Tika 1.0
Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.3.3
Apache ZooKeeper 3.3.4
Upgrading from Solr 3.6-dev
@@ -401,6 +401,14 @@ Upgrading from Solr 3.5
* As doGet() methods in SimplePostTool was changed to static, the client applications of this
class need to be recompiled.
* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
character offsets it provided, triggering e.g. exceptions in highlighting.
HTMLStripCharFilter has been re-implemented, addressing this and other
issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
detailed list of changes. For people who depend on the behavior of
HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
(bugs and all) is preserved as LegacyHTMLStripCharFilter.
New Features
----------------------
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@@ -442,6 +450,10 @@ New Features
* SOLR-1709: Distributed support for Date and Numeric Range Faceting
(Peter Sturge, David Smiley, hossman, Simon Willnauer)
* SOLR-3054, LUCENE-3671: Add TypeTokenFilterFactory that creates TypeTokenFilter
that filters tokens based on their TypeAttribute. (Tommaso Teofili via
Uwe Schindler)
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
@@ -483,6 +495,52 @@ Bug Fixes
* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
HTMLStripCharFilter as a JFlex-generated scanner. See below for a list
of bug fixes and other changes. To get the same behavior as
HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
use LegacyHTMLStripCharFilter, which is the previous implementation.
Behavior changes from the previous version:
- Known offset bugs are fixed.
- The "Mark invalid" exceptions reported in SOLR-1283 are no longer
triggered (the bug is still present in LegacyHTMLStripCharFilter).
- The character entity "&apos;" is now always properly decoded.
- More cases of <script> tags are now properly stripped.
- CDATA sections are now handled properly.
- Valid tag name characters now include the supplementary Unicode characters
from Unicode character classes [:ID_Start:] and [:ID_Continue:].
- Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
and "&AMP;" are now recognized and handled as if they were in lowercase.
- The REPLACEMENT CHARACTER U+FFFD is now used to replace numeric character
entities for unpaired UTF-16 low and high surrogates (in the range
[U+D800-U+DFFF]).
- Properly paired numeric character entities for UTF-16 surrogates are now
converted to the corresponding code units.
- Opening tags with unbalanced quotation marks are now properly stripped.
- Literal "<" and ">" characters in opening tags, regardless of whether they
appear inside quotation marks, now inhibit recognition (and stripping) of
the tags. The only exception to this is for values of event-handler
attributes, e.g. "onClick", "onLoad", "onSelect".
- A newline '\n' is substituted instead of a space for stripped HTML markup.
- Nothing is substituted for opening and closing inline tags - they are
simply removed. The list of inline tags is (case insensitively): <a>,
<abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
<em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
<select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
<tt>, <u>, and <var>.
- HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
feature: opening and closing tags with the given names, including any
attributes and their values, are left intact in the output.
(Steve Rowe)
* LUCENE-3717: Fixed offset bugs in TrimFilter, WordDelimiterFilter, and
HyphenatedWordsFilter where they would create invalid offsets in
some situations, leading to problems in highlighting. (Robert Muir)
* SOLR-2280: commitWithin ignored for a delete query (Juan Grande via janhoy)
Other Changes
----------------------
* SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
@@ -498,6 +556,8 @@ Other Changes
* SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
(ehatcher)
* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)

View File

@@ -482,7 +482,7 @@
<packageset dir="contrib/langid/src/java"/>
<packageset dir="contrib/uima/src/java"/>
<group title="Core" packages="org.apache.*" />
<group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj*" />
<group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj.*,org.apache.zookeeper.*" />
<group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
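# Builds the example, then starts a two-shard SolrCloud pair: the first node runs embedded ZooKeeper (-DzkRun) and bootstraps the config from solr/conf; the second node joins it via -DzkHost=localhost:9983.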
cd ..
rm -r -f example2
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env bash
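# Four-node variant of the script above: the example is copied three times and all four nodes are started with -DnumShards=2; the first node bootstraps the config and runs ZooKeeper on port 9983.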
cd ..
rm -r -f example2
rm -r -f example3
rm -r -f example4
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cp -r -f example example3
cp -r -f example example4
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=9574 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &
cd ../example3
java -Djetty.port=9575 -DzkRun -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &
cd ../example4
java -Djetty.port=9576 -DzkHost=localhost:9983 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
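# Like the previous script, but the first three nodes each pass -DzkRun with a shared -DzkHost list, forming a ZooKeeper ensemble; the fourth node only connects to that ensemble.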
cd ..
rm -r -f example2
rm -r -f example3
rm -r -f example4
rm -r -f dist
rm -r -f build
rm -r -f example/solr/zoo_data
rm -f example/example.log
ant example dist
cp -r -f example example2
cp -r -f example example3
cp -r -f example example4
cd example
java -DzkRun -DnumShards=2 -DSTOP.PORT=7983 -DSTOP.KEY=key -Dbootstrap_confdir=solr/conf -DzkHost=localhost:9983,localhost:14574,localhost:14585 -jar start.jar 1>example.log 2>&1 &
sleep 10
cd ../example2
java -Djetty.port=13574 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14575 -DnumShards=2 -DSTOP.PORT=6574 -DSTOP.KEY=key -jar start.jar 1>example2.log 2>&1 &
cd ../example3
java -Djetty.port=13585 -DzkRun -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6575 -DSTOP.KEY=key -jar start.jar 1>example3.log 2>&1 &
cd ../example4
java -Djetty.port=13596 -DzkHost=localhost:9983,localhost:14574,localhost:14585 -DnumShards=2 -DSTOP.PORT=6576 -DSTOP.KEY=key -jar start.jar 1>example4.log 2>&1 &

Some files were not shown because too many files have changed in this diff.