mirror of https://github.com/apache/lucene.git
- perl -pi -e 's/\t/ /g'
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413584 13f79535-47bb-0310-9956-ffa450edef68
parent 545088a082
commit f0bfc02d4d

@@ -39,345 +39,345 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class AnalyzerUtil {
  
  private AnalyzerUtil() {};
  
  /**
   * Returns a simple analyzer wrapper that logs all tokens produced by the
   * underlying child analyzer to the given log stream (typically System.err);
   * Otherwise behaves exactly like the child analyzer, delivering the very
   * same tokens; useful for debugging purposes on custom indexing and/or
   * querying.
   * 
   * @param child
   *            the underlying child analyzer
   * @param log
   *            the print stream to log to (typically System.err)
   * @param logName
   *            a name for this logger (typically "log" or similar)
   * @return a logging analyzer
   */
  public static Analyzer getLoggingAnalyzer(final Analyzer child, 
      final PrintStream log, final String logName) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (log == null)
      throw new IllegalArgumentException("logStream must not be null");
    
    return new Analyzer() {
      public TokenStream tokenStream(final String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int position = -1;
          
          public Token next() throws IOException {
            Token token = input.next(); // from filter super class
            log.println(toString(token));
            return token;
          }
          
          private String toString(Token token) {
            if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
            
            position += token.getPositionIncrement();
            return "[" + logName + ":" + position + ":" + fieldName + ":"
                + token.termText() + ":" + token.startOffset()
                + "-" + token.endOffset() + ":" + token.type()
                + "]";
          }
        };
      }
    };
  }
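  
  // Illustrative usage sketch (editor's note, not part of the original class): wraps an
  // arbitrary example child analyzer so that every emitted token is echoed to System.err
  // as [log:position:field:term:start-end:type] while indexing or querying:
  //
  //   Analyzer logging = AnalyzerUtil.getLoggingAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer(), System.err, "log");
  //   TokenStream stream = logging.tokenStream("content",
  //       new StringReader("The quick brown fox"));
  //   while (stream.next() != null) {} // tokens pass through unchanged, but are logged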
  
  
  /**
   * Returns an analyzer wrapper that returns at most the first
   * <code>maxTokens</code> tokens from the underlying child analyzer,
   * ignoring all remaining tokens.
   * 
   * @param child
   *            the underlying child analyzer
   * @param maxTokens
   *            the maximum number of tokens to return from the underlying
   *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)
   * @return an analyzer wrapper
   */
  public static Analyzer getMaxTokenAnalyzer(
      final Analyzer child, final int maxTokens) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (maxTokens < 0)
      throw new IllegalArgumentException("maxTokens must not be negative");
    if (maxTokens == Integer.MAX_VALUE)
      return child; // no need to wrap
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int todo = maxTokens;
          
          public Token next() throws IOException {
            return --todo >= 0 ? input.next() : null;
          }
        };
      }
    };
  }
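  
  // Illustrative usage sketch (editor's note, not part of the original class): limit
  // analysis to the first 1000 tokens per field, e.g. when only the beginning of very
  // large documents matters. The child analyzer is an arbitrary example choice:
  //
  //   Analyzer first1000 = AnalyzerUtil.getMaxTokenAnalyzer(
  //       new org.apache.lucene.analysis.standard.StandardAnalyzer(), 1000);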
  
  
  /**
   * Returns an English stemming analyzer that stems tokens from the
   * underlying child analyzer according to the Porter stemming algorithm. The
   * child analyzer must deliver tokens in lower case for the stemmer to work
   * properly.
   * <p>
   * Background: Stemming reduces token terms to their linguistic root form
   * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to
   * "famili", as well as "complete" and "completion" to "complet". Note that
   * the root form is not necessarily a meaningful word in itself, and that
   * this is not a bug but rather a feature, if you lean back and think about
   * fuzzy word matching for a bit.
   * <p>
   * See the Lucene contrib packages for stemmers (and stop words) for German,
   * Russian and many more languages.
   * 
   * @param child
   *            the underlying child analyzer
   * @return an analyzer wrapper
   */
  public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new PorterStemFilter(
            child.tokenStream(fieldName, reader));
//        /* PorterStemFilter and SnowballFilter have the same behaviour,
//        but PorterStemFilter is much faster. */
//        return new org.apache.lucene.analysis.snowball.SnowballFilter(
//            child.tokenStream(fieldName, reader), "English");
      }
    };
  }
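  
  // Illustrative usage sketch (editor's note, not part of the original class): stem
  // lower-cased tokens so that a search for "fishing" also matches "fishes" and "fish".
  // SimpleAnalyzer is used here only because it already emits lower-case tokens:
  //
  //   Analyzer stemming = AnalyzerUtil.getPorterStemmerAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer());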
  
  
  /**
   * Returns an analyzer wrapper that wraps the underlying child analyzer's
   * token stream into a {@link SynonymTokenFilter}.
   * 
   * @param child
   *            the underlying child analyzer
   * @param synonyms
   *            the map used to extract synonyms for terms
   * @param maxSynonyms
   *            the maximum number of synonym tokens to return per underlying
   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
   * @return a new analyzer
   */
  public static Analyzer getSynonymAnalyzer(final Analyzer child, 
      final SynonymMap synonyms, final int maxSynonyms) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");
    if (maxSynonyms == 0)
      return child; // no need to wrap
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new SynonymTokenFilter(
          child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
      }
    };
  }
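  
  // Illustrative usage sketch (editor's note, not part of the original class): expand
  // each token with up to 5 WordNet synonyms at index or query time. The synonym file
  // path below is a hypothetical example; SynonymMap and SynonymTokenFilter are the
  // companion classes in this package:
  //
  //   SynonymMap wordnet = new SynonymMap(new java.io.FileInputStream("samples/fulltext/wn_s.pl"));
  //   Analyzer withSynonyms = AnalyzerUtil.getSynonymAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer(), wordnet, 5);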
  
  
  /**
   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
   * sorted descending by frequency (and ascending by term, if tied).
   * <p>
   * Example XQuery:
   * <pre>
   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
   * 
   * for $pair in util:get-most-frequent-terms(
   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
   * </pre>
   * 
   * @param analyzer
   *            the analyzer to use for splitting text into terms (aka words)
   * @param text
   *            the text to analyze
   * @param limit
   *            the maximum number of pairs to return; zero indicates 
   *            "as many as possible".
   * @return an array of (frequency:term) pairs in the form of (freq0:term0, 
   *         freq1:term1, ..., freqN:termN). Each pair is a single string
   *         separated by a ':' delimiter.
   */
  public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
    if (analyzer == null)
      throw new IllegalArgumentException("analyzer must not be null");
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    if (limit <= 0) limit = Integer.MAX_VALUE;
    
    // compute frequencies of distinct terms
    HashMap map = new HashMap();
    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
    try {
      Token token;
      while ((token = stream.next()) != null) {
        MutableInteger freq = (MutableInteger) map.get(token.termText());
        if (freq == null) {
          freq = new MutableInteger(1);
          map.put(token.termText(), freq);
        } else {
          freq.setValue(freq.intValue() + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        stream.close();
      } catch (IOException e2) {
        throw new RuntimeException(e2);
      }
    }
    
    // sort by frequency, text
    Map.Entry[] entries = new Map.Entry[map.size()];
    map.entrySet().toArray(entries);
    Arrays.sort(entries, new Comparator() {
      public int compare(Object o1, Object o2) {
        Map.Entry e1 = (Map.Entry) o1;
        Map.Entry e2 = (Map.Entry) o2;
        int f1 = ((MutableInteger) e1.getValue()).intValue();
        int f2 = ((MutableInteger) e2.getValue()).intValue();
        if (f2 - f1 != 0) return f2 - f1;
        String s1 = (String) e1.getKey();
        String s2 = (String) e2.getKey();
        return s1.compareTo(s2);
      }
    });
    
    // return top N entries
    int size = Math.min(limit, entries.length);
    String[] pairs = new String[size];
    for (int i=0; i < size; i++) {
      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
    }
    return pairs;
  }
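  
  // Illustrative usage sketch (editor's note, not part of the original class): a quick
  // word frequency report over an arbitrary example string:
  //
  //   String[] pairs = AnalyzerUtil.getMostFrequentTerms(
  //       PatternAnalyzer.DEFAULT_ANALYZER, "hello hello world", 10);
  //   // pairs == { "2:hello", "1:world" } -- frequency and term joined by ':'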
  
  private static final class MutableInteger {
    private int value;
    public MutableInteger(int value) { this.value = value; }
    public int intValue() { return value; }
    public void setValue(int value) { this.value = value; }
    public String toString() { return String.valueOf(value); }
  };
  
  
  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
  /** (Line terminator followed by zero or more whitespace) two or more times */
  private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");
  
  /**
   * Returns at most the first N paragraphs of the given text. Delimiting
   * characters are excluded from the results. Each returned paragraph is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   * 
   * @param text
   *            the text to tokenize into paragraphs
   * @param limit
   *            the maximum number of paragraphs to return; zero indicates "as
   *            many as possible".
   * @return the first N paragraphs
   */
  public static String[] getParagraphs(String text, int limit) {
    return tokenize(PARAGRAPHS, text, limit);
  }
  
  private static String[] tokenize(Pattern pattern, String text, int limit) {
    String[] tokens = pattern.split(text, limit);
    for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
    return tokens;
  }
  
  
  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
  /** Divides text into sentences; Includes inverted Spanish exclamation and question mark */
  private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");
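  
  // Illustrative note (editor's addition, not part of the original class): the PARAGRAPHS
  // pattern treats two or more consecutive line terminators (possibly padded with spaces
  // or tabs) as one paragraph break, so for example:
  //
  //   String[] paras = AnalyzerUtil.getParagraphs("first para\n\nsecond para", 0);
  //   // paras == { "first para", "second para" }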
  
  /**
   * Returns at most the first N sentences of the given text. Delimiting
   * characters are excluded from the results. Each returned sentence is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   * 
   * @param text
   *            the text to tokenize into sentences
   * @param limit
   *            the maximum number of sentences to return; zero indicates "as
   *            many as possible".
   * @return the first N sentences
   */
  public static String[] getSentences(String text, int limit) {
//    return tokenize(SENTENCES, text, limit); // equivalent but slower
    int len = text.length();
    if (len == 0) return new String[] { text };
    if (limit <= 0) limit = Integer.MAX_VALUE;
    
    // average sentence length heuristic
    String[] tokens = new String[Math.min(limit, 1 + len/40)];
    int size = 0;
    int i = 0;
    
    while (i < len && size < limit) {
      
      // scan to end of current sentence
      int start = i;
      while (i < len && !isSentenceSeparator(text.charAt(i))) i++;
      
      if (size == tokens.length) { // grow array
        String[] tmp = new String[tokens.length << 1];
        System.arraycopy(tokens, 0, tmp, 0, size);
        tokens = tmp;
      }
      // add sentence (potentially empty)
      tokens[size++] = text.substring(start, i).trim();
      
      // scan to beginning of next sentence
      while (i < len && isSentenceSeparator(text.charAt(i))) i++;
    }
    
    if (size == tokens.length) return tokens;
    String[] results = new String[size];
    System.arraycopy(tokens, 0, results, 0, size);
    return results;
  }
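  
  // Illustrative usage sketch (editor's note, not part of the original class): sentence
  // splitting on '.', '!', '?' and the inverted Spanish marks, over an example string:
  //
  //   String[] sentences = AnalyzerUtil.getSentences("Nice. Very nice! Really?", 0);
  //   // sentences == { "Nice", "Very nice", "Really" }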
  
  private static boolean isSentenceSeparator(char c) {
    // regex [!\\.\\?\\xA1\\xBF]
    switch (c) {
      case '!': return true;
      case '.': return true;
      case '?': return true;
      case 0xA1: return true; // Spanish inverted exclamation mark
      case 0xBF: return true; // Spanish inverted question mark
      default: return false;
    }
  }
  
}

File diff suppressed because it is too large

@@ -63,397 +63,397 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzer extends Analyzer {
  
  /** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
  
  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
    "a", "about", "above", "across", "adj", "after", "afterwards",
    "again", "against", "albeit", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "among", "amongst", "an",
    "and", "another", "any", "anyhow", "anyone", "anything",
    "anywhere", "are", "around", "as", "at", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between",
    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
    "down", "during", "each", "eg", "either", "else", "elsewhere",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except", "few", "first", "for", "former",
    "formerly", "from", "further", "had", "has", "have", "he", "hence",
    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself", "namely", "neither", "never",
    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once one", "only", "onto", "or", "other", "others", "otherwise",
    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
    "several", "she", "should", "since", "so", "some", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhere",
    "still", "such", "t", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefor", "therein", "thereupon", "these", "they", "this",
    "those", "though", "through", "throughout", "thru", "thus", "to",
    "together", "too", "toward", "towards", "under", "until", "up",
    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "whatsoever", "when", "whence", "whenever",
    "whensoever", "where", "whereafter", "whereas", "whereat",
    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
    "whereon", "whereto", "whereunto", "whereupon", "wherever",
    "wherewith", "whether", "which", "whichever", "whichsoever",
    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
    "yourselves"});
  
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
  
  /**
   * A lower-casing word analyzer with <b>extended</b> English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
  
  private final Pattern pattern;
  private final boolean toLowerCase;
  private final Set stopWords;
  
  /**
   * Constructs a new instance with the given parameters.
   * 
   * @param pattern
   *            a regular expression delimiting tokens
   * @param toLowerCase
   *            if <code>true</code> returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
   *            or <a href="http://www.unine.ch/info/clef/">other stop words
   *            lists</a>.
   */
  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null)
      throw new IllegalArgumentException("pattern must not be null");
    
    if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
    
    if (stopWords != null && stopWords.size() == 0) stopWords = null;
    
    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
  }
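  
  // Illustrative usage sketch (editor's note, not part of the original class): the shared
  // singletons cover the common cases, while the constructor allows custom delimiters;
  // the comma pattern below is an arbitrary example:
  //
  //   PatternAnalyzer words = PatternAnalyzer.DEFAULT_ANALYZER; // non-letters, lower case, English stop words
  //   PatternAnalyzer csv = new PatternAnalyzer(Pattern.compile(","), false, null);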
  
  /**
   * Creates a token stream that tokenizes the given string into token terms
   * (aka words).
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    
    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (pattern == WHITESPACE_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
      if (stopWords != null) stream = new StopFilter(stream, stopWords);
    }
    
    return stream;
  }
  
  /**
   * Creates a token stream that tokenizes all the text in the given Reader;
   * This implementation forwards to <code>tokenStream(String, String)</code> and is
   * less efficient than <code>tokenStream(String, String)</code>.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    if (reader instanceof FastStringReader) { // fast path
      return tokenStream(fieldName, ((FastStringReader)reader).getString());
    }
    
    try {
      String text = toString(reader);
      return tokenStream(fieldName, text);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
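  
  // Illustrative usage sketch (editor's note, not part of the original class): when the
  // full text is already in memory, the String flavour (or a FastStringReader wrapper)
  // avoids draining a Reader into a buffer first. Field name and text are example values:
  //
  //   TokenStream direct = DEFAULT_ANALYZER.tokenStream("content", "The quick brown fox");
  //   TokenStream wrapped = DEFAULT_ANALYZER.tokenStream("content",
  //       new FastStringReader("The quick brown fox"));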
  
  /**
   * Indicates whether some other object is "equal to" this one.
   * 
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  public boolean equals(Object other) {
    if (this == other) return true;
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
    
    if (other instanceof PatternAnalyzer) {
      PatternAnalyzer p2 = (PatternAnalyzer) other;
      return 
        toLowerCase == p2.toLowerCase &&
        eqPattern(pattern, p2.pattern) &&
        eq(stopWords, p2.stopWords);
    }
    return false;
  }
  
  /**
   * Returns a hash code value for the object.
   * 
   * @return the hash code.
   */
  public int hashCode() {
    if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
    if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
    
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237);
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }
  
  /** equality where o1 and/or o2 can be null */
  private static boolean eq(Object o1, Object o2) {
    return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
  }
  
  /** assumes p1 and p2 are not null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
  }
  
  /**
   * Reads until end-of-stream and returns all read chars, finally closes the stream.
   * 
   * @param input the input stream
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      int len = 256;
      char[] buffer = new char[len];
      char[] output = new char[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          char[] tmp = new char[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      // return only the chars actually read; 'output' may have spare capacity beyond 'len'
      return new String(output, 0, len);
    } finally {
      if (input != null) input.close();
    }
  }
  
  /** somewhat oversized to minimize hash collisions */
  private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    stops.addAll(Arrays.asList(stopWords));
    return stops;
//    return Collections.unmodifiableSet(stops);
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The work horse; performance isn't fantastic, but it's not nearly as bad
   * as one might think - kudos to the Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {
    
    private final String str;
    private final boolean toLowerCase;
    private Matcher matcher;
    private int pos = 0;
    private static final Locale locale = Locale.getDefault();
    
    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.str = str;
      this.matcher = pattern.matcher(str);
      this.toLowerCase = toLowerCase;
    }
    
    public Token next() {
      if (matcher == null) return null;
      
      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        boolean isMatch = matcher.find();
        if (isMatch) {
          end = matcher.start();
          pos = matcher.end();
        } else {
          end = str.length();
          matcher = null; // we're finished
        }
        
        if (start != end) { // non-empty match (header/trailer)
          String text = str.substring(start, end);
          if (toLowerCase) text = text.toLowerCase(locale);
          return new Token(text, start, end);
        }
        if (!isMatch) return null;
      }
    }
    
  }
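  
  // Illustrative note (editor's addition, not part of the original class): the pattern
  // marks the delimiters and the text between consecutive matches becomes the tokens;
  // e.g. with NON_WORD_PATTERN the input "The quick, brown fox!" yields the tokens
  // "the", "quick", "brown", "fox" (lower-cased, with original start/end offsets).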
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary.
   */
  private static final class FastStringTokenizer extends TokenStream {
    
    private final String str;
    private int pos;
    private final boolean isLetter;
    private final boolean toLowerCase;
    private final Set stopWords;
    private static final Locale locale = Locale.getDefault();
    
    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }
    
    public Token next() {
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
      int i = pos;
      boolean letter = isLetter;
      
      int start = 0;
      String text;
      do {
        // find beginning of token
        text = null;
        while (i < len && !isTokenChar(s.charAt(i), letter)) {
          i++;
        }
        
        if (i < len) { // found beginning; now find end of token
          start = i;
          while (i < len && isTokenChar(s.charAt(i), letter)) {
            i++;
          }
          
          text = s.substring(start, i);
          if (toLowerCase) text = text.toLowerCase(locale);
//          if (toLowerCase) {
////            use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
////            see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
//            text = s.substring(start, i).toLowerCase(); 
////            char[] chars = new char[i-start];
////            for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
////            text = new String(chars);
//          } else {
//            text = s.substring(start, i);
//          }
        }
      } while (text != null && isStopWord(text));
      
      pos = i;
      return text != null ? new Token(text, start, i) : null;
    }
    
    private boolean isTokenChar(char c, boolean isLetter) {
      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
    }
    
    private boolean isStopWord(String text) {
      return stopWords != null && stopWords.contains(text);
    }
    
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {
    
    private final String s;
    
    FastStringReader(String s) {
      super(s);
      this.s = s;
    }
    
    String getString() {
      return s;
    }
  }
  
|
||||
|
||||
private final String s;
|
||||
|
||||
FastStringReader(String s) {
|
||||
super(s);
|
||||
this.s = s;
|
||||
}
|
||||
|
||||
String getString() {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
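
For orientation, here is a minimal usage sketch of the analyzer defined above. It assumes the Lucene 1.9/2.0-era TokenStream API used throughout this file (next() returns a Token or null) and the PatternAnalyzer.DEFAULT_ANALYZER constant referenced later in the test code; treat it as an illustration, not part of the patch.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Illustrative sketch only; assumes it lives in the same package as PatternAnalyzer.
public class PatternAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    PatternAnalyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER; // default pattern and options
    TokenStream stream = analyzer.tokenStream("content", new StringReader("The quick brown Fox"));
    for (Token t = stream.next(); t != null; t = stream.next()) {
      // prints each term with its character offsets
      System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")");
    }
  }
}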
@ -75,325 +75,325 @@ import java.util.TreeSet;
 */
public class SynonymMap {
  
  /** the index data; Map<String word, String[] synonyms> */
  private final HashMap table;
  
  private static final String[] EMPTY = new String[0];
  
  private static final boolean DEBUG = false;
  
  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   * 
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
  }
  
  /**
   * Returns the synonym set for the given word, sorted ascending.
   * 
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    Object syns = table.get(word);
    if (syns == null) return EMPTY;
    if (syns instanceof String) return new String[] {(String) syns};
    
    String[] synonyms = (String[]) syns;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }
  
  /**
   * Returns a String representation of the index data for debugging purposes.
   * 
   * @return a String representation
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    Iterator iter = new TreeMap(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;
    
    while (iter.hasNext()) {
      String word = (String) iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }
    
    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 + ", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }
  
  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   * 
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }
  
  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }
  
  private HashMap read(byte[] data) {
    int WORDS = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
    HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;
    
    while (i < len) { // until EOF
      /* Part A: Parse a line */
      
      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;
      
      // parse group
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);
      
      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;
      
      // scan to end of word
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"
      
      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII
      
      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore
      
      
      /* Part C: Add (group,word) to tables */
      
      // ensure compact string representation, minimizing memory overhead
      String w = (String) internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }
      
      Integer group = lastGroup;
      if (num != lastNum) {
        group = new Integer(num);
        lastGroup = group;
        lastNum = num;
      }
      
      // add word --> group
      ArrayList groups = (ArrayList) word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);
      
      // add group --> word
      ArrayList words = (ArrayList) group2Words.get(group);
      if (words == null) {
        words = new ArrayList(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }
    
    
    /* Part D: compute index data structure */
    HashMap word2Syns = createIndex(word2Groups, group2Words);
    
    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    group2Words = null; // help gc
    return optimize(word2Syns, internedWords);
  }
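
The parser in read() above expects WordNet's prolog synset export, where each line looks roughly like s(100001740,1,'entity',n,1,11). The group number is read up to the first comma and the word is taken from between the following single quotes. A hedged sketch of feeding a few such lines directly (the sample lines are made up for illustration, not real WordNet data):

import java.io.ByteArrayInputStream;
import java.io.IOException;

// Illustrative sketch only; assumes it lives in the same package as SynonymMap.
public class SynonymMapLoadSketch {
  public static void main(String[] args) throws IOException {
    // Made-up lines in the expected s(groupNumber,wordNumber,'word',...) layout
    String prolog =
        "s(100000001,1,'big',a,1,0).\n" +
        "s(100000001,2,'large',a,1,0).\n" +
        "s(100000002,1,'large',a,2,0).\n" +
        "s(100000002,2,'huge',a,1,0).\n";
    SynonymMap map = new SynonymMap(new ByteArrayInputStream(prolog.getBytes("UTF-8")));
    // "large" shares group ...001 with "big" and group ...002 with "huge"
    System.out.println(java.util.Arrays.asList(map.getSynonyms("large"))); // [big, huge]
  }
}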
  
  private HashMap createIndex(Map word2Groups, Map group2Words) {
    HashMap word2Syns = new HashMap();
    Iterator iter = word2Groups.entrySet().iterator();
    
    while (iter.hasNext()) { // for each word
      Map.Entry entry = (Map.Entry) iter.next();
      ArrayList group = (ArrayList) entry.getValue();
      String word = (String) entry.getKey();
      
//      HashSet synonyms = new HashSet();
      TreeSet synonyms = new TreeSet();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList words = (ArrayList) group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words
          Object synonym = words.get(j); // note that w and word are interned
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }
      
      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = (String) synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }
    
    return word2Syns;
  }
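
To picture what createIndex() computes, here is a tiny illustrative re-implementation with plain collections (not part of the class): words that share at least one group become synonyms of each other, and a word is never listed as its own synonym.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

// Toy re-implementation of the group inversion, for illustration only.
public class GroupInversionSketch {
  public static void main(String[] args) {
    Map<String, List<Integer>> word2Groups = new HashMap<String, List<Integer>>();
    word2Groups.put("big", Arrays.asList(1));
    word2Groups.put("large", Arrays.asList(1, 2));
    word2Groups.put("huge", Arrays.asList(2));
    
    Map<Integer, List<String>> group2Words = new HashMap<Integer, List<String>>();
    group2Words.put(1, Arrays.asList("big", "large"));
    group2Words.put(2, Arrays.asList("large", "huge"));
    
    for (Map.Entry<String, List<Integer>> e : word2Groups.entrySet()) {
      TreeSet<String> synonyms = new TreeSet<String>();
      for (Integer group : e.getValue()) {
        for (String w : group2Words.get(group)) {
          if (!w.equals(e.getKey())) synonyms.add(w); // a word is not its own synonym
        }
      }
      System.out.println(e.getKey() + " -> " + synonyms); // e.g. large -> [big, huge]
    }
  }
}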
  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    
    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry entry = (Map.Entry) iter.next();
      allSynonyms[j] = (String[]) entry.getValue();
      words[j] = (String) entry.getKey();
      len += words[j].length();
    }
    
    // assemble large string containing all words
    StringBuffer buf = new StringBuffer(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;
    
    // intern words at app level via memory-overlaid substrings
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }
    
    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = (String) internedWords.get(syns[k]);
      }
      Object replacement = syns;
      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), replacement);
    }
    
    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }
  
  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
  
}
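
A note on the optimize() step above: it relies on the fact that, on the JDKs this code targeted (before Java 7u6), String.substring() shared the parent string's backing char[], so slicing every word out of one large string collapsed per-word array overhead. A small sketch of the same idea in isolation (on modern JDKs substring() copies, so the saving no longer applies):

import java.util.HashMap;
import java.util.Map;

// Sketch of the substring-interning idea used in optimize(), for illustration only.
public class SubstringInterningSketch {
  public static void main(String[] args) {
    String[] words = { "big", "large", "huge" };
    
    // one large backing string containing all words back to back
    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < words.length; i++) buf.append(words[i]);
    String allWords = buf.toString();
    
    // hand out slices of the backing string instead of the original objects
    Map<String, String> interned = new HashMap<String, String>();
    int p = 0;
    for (int i = 0; i < words.length; i++) {
      interned.put(words[i], allWords.substring(p, p + words[i].length()));
      p += words[i].length();
    }
    System.out.println(interned); // each word maps to an equal slice of allWords
  }
}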
@ -30,105 +30,105 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class SynonymTokenFilter extends TokenFilter {
  
  /** The Token.type used to indicate a synonym to higher level filters. */
  public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";

  private final SynonymMap synonyms;
  private final int maxSynonyms;
  
  private String[] stack = null;
  private int index = 0;
  private Token current = null;
  private int todo = 0;
  
  /**
   * Creates an instance for the given underlying stream and synonym table.
   * 
   * @param input
   *            the underlying child token stream
   * @param synonyms
   *            the map used to extract synonyms for terms
   * @param maxSynonyms
   *            the maximum number of synonym tokens to return per underlying
   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
   */
  public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input);
    if (input == null)
      throw new IllegalArgumentException("input must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");
    
    this.synonyms = synonyms;
    this.maxSynonyms = maxSynonyms;
  }
  
  /** Returns the next token in the stream, or null at EOS. */
  public Token next() throws IOException {
    Token token;
    while (todo > 0 && index < stack.length) { // pop from stack
      token = createToken(stack[index++], current);
      if (token != null) {
        todo--;
        return token;
      }
    }
    
    token = input.next();
    if (token == null) return null; // EOS; iterator exhausted
    
    stack = synonyms.getSynonyms(token.termText()); // push onto stack
    if (stack.length > maxSynonyms) randomize(stack);
    index = 0;
    current = token;
    todo = maxSynonyms;
    return token;
  }
  
  /**
   * Creates and returns a token for the given synonym of the current input
   * token; Override for custom (stateless or stateful) behaviour, if desired.
   * 
   * @param synonym
   *            a synonym for the current token's term
   * @param current
   *            the current token from the underlying child stream
   * @return a new token, or null to indicate that the given synonym should be
   *         ignored
   */
  protected Token createToken(String synonym, Token current) {
    Token token = new Token(
      synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
    token.setPositionIncrement(0);
    return token;
  }
  
  /**
   * Randomize synonyms to later sample a subset. Uses constant random seed
   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
   * number generator with medium statistical quality (multiplicative
   * congruential method), producing integers in the range [Integer.MIN_VALUE,
   * Integer.MAX_VALUE].
   */
  private static void randomize(Object[] arr) {
    int seed = 1234567; // constant
    int randomState = 4*seed + 1;
//    Random random = new Random(seed); // unnecessary overhead
    int len = arr.length;
    for (int i=0; i < len-1; i++) {
      randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
      int r = randomState % (len-i);
      if (r < 0) r = -r; // e.g. -9 % 2 == -1
//      int r = random.nextInt(len-i);
      
      // swap arr[i, i+r]
      Object tmp = arr[i];
      arr[i] = arr[i + r];
      arr[i + r] = tmp;
    }
  }
  
}
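
End-to-end, the filter above is meant to be chained behind an ordinary analyzer. A hedged wiring sketch follows; the file name wn_s.pl (WordNet prolog synset export) is an assumption, substitute whatever synonym data you actually have.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Illustrative sketch only; assumes it lives in the same package as SynonymMap/SynonymTokenFilter.
public class SynonymFilterSketch {
  public static void main(String[] args) throws IOException {
    SynonymMap synonyms = new SynonymMap(new FileInputStream("wn_s.pl")); // file name is an assumption
    TokenStream stream = new StandardAnalyzer().tokenStream("content", new StringReader("fast cars"));
    stream = new SynonymTokenFilter(stream, synonyms, 3); // at most 3 synonyms per term
    for (Token t = stream.next(); t != null; t = stream.next()) {
      // injected tokens have type SYNONYM and position increment 0, so they
      // stack on top of the original term for phrase and span queries
      System.out.println(t.termText() + " (" + t.type() + ", +" + t.getPositionIncrement() + ")");
    }
  }
}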
@ -197,319 +197,319 @@ the^3
 * @author whoschek.AT.lbl.DOT.gov
 */
public class MemoryIndexTest extends TestCase {
  
  private Analyzer analyzer;
  private boolean fastMode = false;
  
  private static final String FIELD_NAME = "content";
  
  /** Runs the tests and/or benchmark */
  public static void main(String[] args) throws Throwable {
    new MemoryIndexTest().run(args);
  }
  
//  public void setUp() { }
//  public void tearDown() {}
  
  public void testMany() throws Throwable {
    String[] files = listFiles(new String[] {
      "*.txt", "*.html", "*.xml", "xdocs/*.xml",
      "src/java/test/org/apache/lucene/queryParser/*.java",
      "src/java/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] {
      "1", "1", "memram",
      "@src/test/org/apache/lucene/index/memory/testqueries.txt",
    };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  }
  
  private void run(String[] args) throws Throwable {
    int k = -1;
    
    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
    
    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
    
    String cmd = "memram";
    if (args.length > ++k) cmd = args[k];
    boolean useMemIndex = cmd.indexOf("mem") >= 0;
    boolean useRAMIndex = cmd.indexOf("ram") >= 0;
    
    String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
    if (args.length > ++k) {
      String arg = args[k];
      if (arg.startsWith("@"))
        queries = readLines(new File(arg.substring(1)));
      else
        queries = new String[] { arg };
    }
    
    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i=k; i < args.length; i++) {
        files[i-k] = new File(args[i]);
      }
    }
    
    boolean toLowerCase = true;
//    boolean toLowerCase = false;
//    Set stopWords = null;
    Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
    
    Analyzer[] analyzers = new Analyzer[] {
      new SimpleAnalyzer(),
      new StopAnalyzer(),
      new StandardAnalyzer(),
      PatternAnalyzer.DEFAULT_ANALYZER,
//      new WhitespaceAnalyzer(),
//      new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
//      new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
//      new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
    };
    
    for (int iter=0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();
      long bytes = 0;
      
      for (int anal=0; anal < analyzers.length; anal++) {
        this.analyzer = analyzers[anal];
        
        for (int i=0; i < files.length; i++) {
          File file = files[i];
          if (!file.exists() || file.isDirectory()) continue; // ignore
          bytes += file.length();
          String text = toString(new FileInputStream(file), null);
          Document doc = createDocument(text);
          System.out.println("\n*********** FILE=" + file);
          
          for (int q=0; q < queries.length; q++) {
            try {
              Query query = parseQuery(queries[q]);
              
              for (int run=0; run < runs; run++) {
                float score1 = 0.0f; float score2 = 0.0f;
                if (useMemIndex) score1 = query(createMemoryIndex(doc), query);
                if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
                if (useMemIndex && useRAMIndex) {
                  System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
                  if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
                    throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
                  }
                }
              }
            } catch (Throwable t) {
              if (t instanceof OutOfMemoryError) t.printStackTrace();
              System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
              throw t;
            }
          }
        }
      }
      long end = System.currentTimeMillis();
      System.out.println("\nsecs = " + ((end-start)/1000.0f));
      System.out.println("queries/sec= " +
        (1.0f * runs * queries.length * analyzers.length * files.length
          / ((end-start)/1000.0f)));
      float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
      System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
    }
    
    if (useMemIndex && useRAMIndex)
      System.out.println("No bug found. done.");
    else
      System.out.println("Done benchmarking (without checking correctness).");
  }
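
The positional arguments parsed at the top of run() are: iterations, runs per query, a command string containing "mem" and/or "ram", then either a single query or an @file listing one query per line, then the files to index. A programmatic invocation might look like the following sketch (file names and the query are placeholders):

// Illustrative invocation only; argument values are placeholders.
public class RunBenchmarkSketch {
  public static void main(String[] args) throws Throwable {
    MemoryIndexTest.main(new String[] {
      "1",                // iterations
      "1",                // runs per query
      "memram",           // exercise both MemoryIndex ("mem") and RAMDirectory ("ram")
      "term* OR Apach~",  // a single query, or "@queries.txt" to read one query per line
      "CHANGES.txt", "LICENSE.txt" // files to index
    });
  }
}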
  
  // returns file line by line, ignoring empty lines and comments
  private String[] readLines(File file) throws Exception {
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        new FileInputStream(file)));
    ArrayList lines = new ArrayList();
    String line;
    while ((line = reader.readLine()) != null) {
      String t = line.trim();
      if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
        lines.add(line);
      }
    }
    reader.close();
    
    String[] result = new String[lines.size()];
    lines.toArray(result);
    return result;
  }
  
  private Document createDocument(String content) {
    Document doc = new Document();
    doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
    return doc;
  }
  
  private MemoryIndex createMemoryIndex(Document doc) {
    MemoryIndex index = new MemoryIndex();
    Enumeration iter = doc.fields();
    while (iter.hasMoreElements()) {
      Field field = (Field) iter.nextElement();
      index.addField(field.name(), field.stringValue(), analyzer);
    }
    return index;
  }
  
  private RAMDirectory createRAMIndex(Document doc) {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = null;
    try {
      writer = new IndexWriter(dir, analyzer, true);
      writer.setMaxFieldLength(Integer.MAX_VALUE);
      writer.addDocument(doc);
      writer.optimize();
      return dir;
    } catch (IOException e) { // should never happen (RAMDirectory)
      throw new RuntimeException(e);
    } finally {
      try {
        if (writer != null) writer.close();
      } catch (IOException e) { // should never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
  }
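
For reference, the single-document index built by createMemoryIndex() can also be queried directly. A minimal sketch using the same API calls as this test (addField, createSearcher); the sample text and query are placeholders:

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;

// Illustrative sketch only; assumes it lives in the same package as MemoryIndex.
public class MemoryIndexSketch {
  public static void main(String[] args) throws Exception {
    MemoryIndex index = new MemoryIndex();
    index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals",
        new SimpleAnalyzer());
    Query query = new QueryParser("content", new SimpleAnalyzer()).parse("+salmon~ +fish*");
    Searcher searcher = index.createSearcher();
    Hits hits = searcher.search(query);
    System.out.println("score=" + (hits.length() > 0 ? hits.score(0) : 0.0f));
    searcher.close();
  }
}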
  
  private float query(Object index, Query query) {
//    System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
    Searcher searcher = null;
    try {
      if (index instanceof Directory)
        searcher = new IndexSearcher((Directory)index);
      else
        searcher = ((MemoryIndex) index).createSearcher();

      final float[] scores = new float[1]; // inits to 0.0f
      searcher.search(query, new HitCollector() {
        public void collect(int doc, float score) {
          scores[0] = score;
        }
      });
      float score = scores[0];
//      Hits hits = searcher.search(query);
//      float score = hits.length() > 0 ? hits.score(0) : 0.0f;
      return score;
    } catch (IOException e) { // should never happen (RAMDirectory)
      throw new RuntimeException(e);
    } finally {
      try {
        if (searcher != null) searcher.close();
      } catch (IOException e) { // should never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
  }
  
  private int getMemorySize(Object index) {
    if (index instanceof Directory) {
      try {
        Directory dir = (Directory) index;
        int size = 0;
        String[] fileNames = dir.list();
        for (int i=0; i < fileNames.length; i++) {
          size += dir.fileLength(fileNames[i]);
        }
        return size;
      }
      catch (IOException e) { // can never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
    else {
      return ((MemoryIndex) index).getMemorySize();
    }
  }
  
  private Query parseQuery(String expression) throws ParseException {
    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
//    parser.setPhraseSlop(0);
    return parser.parse(expression);
  }
  
  /** returns all files matching the given file name patterns (quick n'dirty) */
  static String[] listFiles(String[] fileNames) {
    LinkedHashSet allFiles = new LinkedHashSet();
    for (int i=0; i < fileNames.length; i++) {
      int k;
      if ((k = fileNames[i].indexOf("*")) < 0) {
        allFiles.add(fileNames[i]);
      } else {
        String prefix = fileNames[i].substring(0, k);
        if (prefix.length() == 0) prefix = ".";
        final String suffix = fileNames[i].substring(k+1);
        File[] files = new File(prefix).listFiles(new FilenameFilter() {
          public boolean accept(File dir, String name) {
            return name.endsWith(suffix);
          }
        });
        if (files != null) {
          for (int j=0; j < files.length; j++) {
            allFiles.add(files[j].getPath());
          }
        }
      }
    }
    
    String[] result = new String[allFiles.size()];
    allFiles.toArray(result);
    return result;
  }
  
  // trick to detect default platform charset
  private static final Charset DEFAULT_PLATFORM_CHARSET =
    Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
  
  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  private static String toString(InputStream input, Charset charset) throws IOException {
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
  }
  
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
  
}
@ -60,220 +60,220 @@ silently truncates text, and so the comparison results in assertEquals() don't m
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzerTest extends TestCase {
  
  /** Runs the tests and/or benchmark */
  public static void main(String[] args) throws Throwable {
    new PatternAnalyzerTest().run(args);
  }
  
  public void testMany() throws Throwable {
    String[] files = MemoryIndexTest.listFiles(new String[] {
      "*.txt", "*.html", "*.xml", "xdocs/*.xml",
      "src/test/org/apache/lucene/queryParser/*.java",
      "src/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] {
      "1", "1", "patluc", "1", "2", "2",
    };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  }
  
  private void run(String[] args) throws Throwable {
    int k = -1;
    
    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
    
    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
    
    String cmd = "patluc";
    if (args.length > ++k) cmd = args[k];
    boolean usePattern = cmd.indexOf("pat") >= 0;
    boolean useLucene = cmd.indexOf("luc") >= 0;
    
    int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc
    if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);
    
    int maxToLower = 2;
    if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);
    int maxStops = 2;
    if (args.length > ++k) maxStops = Integer.parseInt(args[k]);

    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i=k; i < args.length; i++) {
        files[i-k] = new File(args[i]);
      }
    }

    for (int iter=0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();
      long bytes = 0;

      for (int i=0; i < files.length; i++) {
        File file = files[i];
        if (!file.exists() || file.isDirectory()) continue; // ignore
        bytes += file.length();
        String text = toString(new FileInputStream(file), null);
        System.out.println("\n*********** FILE=" + file);
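
        // The nested loops below run every combination of tokenization (letters-only
        // vs. whitespace), lower-casing and stop word filtering, and compare the
        // PatternAnalyzer token stream against the equivalent core Lucene chain.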
        for (int letters=0; letters < maxLetters; letters++) {
          boolean lettersOnly = letters == 0;

          for (int stops=0; stops < maxStops; stops++) {
            Set stopWords = null;
            if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);

            for (int toLower=0; toLower < maxToLower; toLower++) {
              boolean toLowerCase = toLower != 0;

              for (int run=0; run < runs; run++) {
                List tokens1 = null; List tokens2 = null;
                try {
                  if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords));
                  if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords));
                  if (usePattern && useLucene) assertEquals(tokens1, tokens2);
                } catch (Throwable t) {
                  if (t instanceof OutOfMemoryError) t.printStackTrace();
                  System.out.println("fatal error at file=" + file + ", letters=" + lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none"));
                  System.out.println("\n\ntokens1=" + toString(tokens1));
                  System.out.println("\n\ntokens2=" + toString(tokens2));
                  throw t;
                }
              }
            }
          }
        }
        long end = System.currentTimeMillis();
        System.out.println("\nsecs = " + ((end-start)/1000.0f));
        System.out.println("files/sec= " +
            (1.0f * runs * maxLetters * maxToLower * maxStops * files.length
            / ((end-start)/1000.0f)));
        float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f);
        System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
      }
    }

    if (usePattern && useLucene)
      System.out.println("No bug found. done.");
    else
      System.out.println("Done benchmarking (without checking correctness).");
  }
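
  // Example invocation (hypothetical; assumes this test lives in the
  // org.apache.lucene.index.memory package and that Lucene plus this contrib
  // module are on the classpath):
  //   java org.apache.lucene.index.memory.PatternAnalyzerTest 1 1 patluc 1 2 2 CHANGES.txt LICENSE.txt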
  private TokenStream patternTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
    Pattern pattern;
    if (letters)
      pattern = PatternAnalyzer.NON_WORD_PATTERN;
    else
      pattern = PatternAnalyzer.WHITESPACE_PATTERN;
    PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
    return analyzer.tokenStream("", text);
  }

  private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
    TokenStream stream;
    if (letters)
      stream = new LetterTokenizer(new StringReader(text));
    else
      stream = new WhitespaceTokenizer(new StringReader(text));
    if (toLowerCase) stream = new LowerCaseFilter(stream);
    if (stopWords != null) stream = new StopFilter(stream, stopWords);
    return stream;
  }
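
  // For the comparison above: PatternAnalyzer.NON_WORD_PATTERN is expected to behave like
  // LetterTokenizer and WHITESPACE_PATTERN like WhitespaceTokenizer, with the same optional
  // LowerCaseFilter and StopFilter applied on top.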

  private List getTokens(TokenStream stream) throws IOException {
    ArrayList tokens = new ArrayList();
    Token token;
    while ((token = stream.next()) != null) {
      tokens.add(token);
    }
    return tokens;
  }

  private void assertEquals(List tokens1, List tokens2) {
    int size = Math.min(tokens1.size(), tokens2.size());
    int i=0;
    try {
      for (; i < size; i++) {
        Token t1 = (Token) tokens1.get(i);
        Token t2 = (Token) tokens2.get(i);
        if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
        if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
        if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
        if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
      }
      if (tokens1.size() != tokens2.size()) throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size());
    }
    catch (IllegalStateException e) {
      if (size > 0) {
        System.out.println("i=" + i + ", size=" + size);
        System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
        System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
      }
      throw e;
    }
  }

  private String toString(List tokens) {
    if (tokens == null) return "null";
    String str = "[";
    for (int i=0; i < tokens.size(); i++) {
      Token t1 = (Token) tokens.get(i);
      str = str + "'" + t1.termText() + "', ";
    }
    return str + "]";
  }

  // trick to detect default platform charset
  private static final Charset DEFAULT_PLATFORM_CHARSET =
      Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
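
  // Note: the empty-stream InputStreamReader above is a JDK 1.4-friendly way to discover the
  // platform default encoding; on Java 5+ Charset.defaultCharset() would presumably report the
  // same charset.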

  // the utility methods below are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
  private static String toString(InputStream input, Charset charset) throws IOException {
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
  }

  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
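
      // At this point len holds the total number of bytes read; the copy below trims the
      // result to exactly len bytes unless the output array happens to be full already.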
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
}
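
For orientation, the sketch below shows the minimal way the PatternAnalyzer exercised by this test can be used on its own. It is not part of the patch and assumes the contrib-memory API visible above: the PatternAnalyzer(Pattern, boolean, Set) constructor, the tokenStream(String, String) overload, and the pre-2.9 Token/TokenStream methods (next(), termText(), offsets); the package name org.apache.lucene.index.memory is likewise an assumption.

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.PatternAnalyzer;

public class PatternAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    // letters-only tokenization, lower-cased, no stop words
    PatternAnalyzer analyzer =
        new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, null);
    TokenStream stream = analyzer.tokenStream("content", "The Quick Brown Fox");
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println(t.termText() + " [" + t.startOffset() + "-" + t.endOffset() + "]");
    }
  }
}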