mirror of https://github.com/apache/lucene.git
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@351896 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a155416b4d
commit
ebe44ace90
|
@ -0,0 +1,395 @@
|
|||
package org.apache.lucene.index.memory;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
 * Loads the <a target="_blank"
 * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
 * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
 * There does not necessarily hold: A -> B, B -> C then A -> C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for
 * high-frequency lookups of medium size synonym tables.
 * <p>
 * Example Usage:
 * <pre>
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i < words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 *
 * Example output:
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * </pre>
 *
 * @author whoschek.AT.lbl.DOT.gov
 * @see <a target="_blank"
 *      href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
 *      man page </a>
 * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
 */
public class SynonymMap {

  /**
   * the index data; Map&lt;String word, String[] synonyms&gt;.
   * Note: for words with exactly one synonym the value is stored as a bare
   * String rather than a String[1], to save memory (see optimize()).
   */
  private final HashMap table;

  // shared zero-length result, so getSynonyms never allocates for misses
  private static final String[] EMPTY = new String[0];

  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   *
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
  }

  /**
   * Returns the synonym set for the given word, sorted ascending.
   *
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    Object syns = table.get(word);
    if (syns == null) return EMPTY;
    // single-synonym entries are stored unwrapped as a plain String
    if (syns instanceof String) return new String[] {(String) syns};

    String[] synonyms = (String[]) syns;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }

  /**
   * Returns a String representation of the index data for debugging purposes.
   * Lists each word with its synonyms, followed by summary statistics
   * (f0..f3 = number of words having exactly 0..3 synonyms).
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    // TreeMap wrapper yields the words in ascending (sorted) key order
    Iterator iter = new TreeMap(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;

    while (iter.hasNext()) {
      String word = (String) iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }

    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }

  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   *
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }

  /**
   * Returns true if the string consists exclusively of letters; used to
   * filter out phrases (spaces, hyphens) and other non-alphabetic entries.
   */
  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  /**
   * Parses the raw wn_s.pl bytes and builds the word --&gt; synonyms index.
   * Each relevant input line is a prolog fact of the shape
   * <code>s(synset_id,w_num,'word',...).</code>; the parser extracts the
   * leading synset number (the "group") and the quoted word, byte by byte,
   * without allocating intermediate line strings.
   *
   * @param data the complete stream contents, assumed UTF-8 encoded
   * @return the optimized index; Map&lt;String word, String or String[] synonyms&gt;
   */
  private HashMap read(byte[] data) {
    int WORDS = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap word2Groups = new HashMap(WORDS); // Map<String word, ArrayList groups>
    HashMap group2Words = new HashMap(GROUPS); // Map<Integer group, ArrayList words>
    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;

    while (i < len) { // until EOF
      /* Part A: Parse a line */

      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;

      // parse group (decimal digits up to the first comma; 48 == '0')
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);

      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;

      // scan to end of word; looping until "'," copes with embedded
      // escaped quotes inside the word itself
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"

      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII

      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore


      /* Part C: Add (group,word) to tables */

      // ensure compact string representation, minimizing memory overhead;
      // IMPORTANT: this app-level interning makes equal words reference-identical,
      // which createIndex() relies on (it compares synonyms with != rather than equals)
      String w = (String) internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }

      // reuse the boxed Integer while consecutive lines share the same group id
      Integer group = lastGroup;
      if (num != lastNum) {
        group = new Integer(num);
        lastGroup = group;
        lastNum = num;
      }

      // add word --> group
      ArrayList groups = (ArrayList) word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList words = (ArrayList) group2Words.get(group);
      if (words == null) {
        words = new ArrayList(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }


    /* Part D: compute index data structure */
    HashMap word2Syns = createIndex(word2Groups, group2Words);

    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    group2Words = null; // help gc
    return optimize(word2Syns, internedWords);
  }

  /**
   * Joins the two intermediate tables into the final index: for each word,
   * collects the union of all words sharing any of its groups (minus the
   * word itself), sorted ascending via TreeSet.
   *
   * @param word2Groups Map&lt;String word, ArrayList groups&gt;
   * @param group2Words Map&lt;Integer group, ArrayList words&gt;
   * @return Map&lt;String word, String[] synonyms&gt;
   */
  private HashMap createIndex(Map word2Groups, Map group2Words) {
    HashMap word2Syns = new HashMap();
    Iterator iter = word2Groups.entrySet().iterator();

    while (iter.hasNext()) { // for each word
      Map.Entry entry = (Map.Entry) iter.next();
      ArrayList group = (ArrayList) entry.getValue();
      String word = (String) entry.getKey();

//      HashSet synonyms = new HashSet();
      TreeSet synonyms = new TreeSet();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList words = (ArrayList) group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words
          Object synonym = words.get(j); // note that w and word are interned
          // identity comparison is safe (and fast) because read() interned all words
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }

      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = (String) synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }

    return word2Syns;
  }

  /**
   * Shrinks memory consumption by (a) re-interning every key and synonym as a
   * substring overlay of one large shared string (so the many small per-word
   * char arrays can be garbage collected) and (b) unwrapping single-element
   * synonym arrays into bare Strings (getSynonyms rewraps them on lookup).
   *
   * @param word2Syns the index produced by createIndex (mutated in place)
   * @param internedWords the word interning table from read()
   * @return the same word2Syns map, now holding the compacted entries
   */
  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }

    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry entry = (Map.Entry) iter.next();
      allSynonyms[j] = (String[]) entry.getValue();
      words[j] = (String) entry.getKey();
      len += words[j].length();
    }

    // assemble large string containing all words
    StringBuffer buf = new StringBuffer(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;

    // intern words at app level via memory-overlaid substrings
    // NOTE(review): assumes substring shares the backing char[] of allWords,
    // true on the JDKs of the era this was written for — confirm on target JDK
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }

    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = (String) internedWords.get(syns[k]);
      }
      Object replacement = syns;
      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), replacement);
    }

    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }

  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux

  /**
   * Reads the entire stream into a byte array, growing the buffer as needed.
   * Does NOT close the stream on normal return paths only — the finally
   * block closes it in all cases.
   *
   * @param input the stream to drain (closed before returning)
   * @return the exact bytes read (length == number of bytes in the stream)
   * @throws IOException if reading fails
   */
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          // deliberate swap: the old output array becomes the read buffer so
          // subsequent bulk reads can be larger; tmp becomes the new output
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }

}
|
|
@ -0,0 +1,134 @@
|
|||
package org.apache.lucene.index.memory;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Injects additional tokens for synonyms of token terms fetched from the
|
||||
* underlying child stream; the child stream must deliver lowercase tokens
|
||||
* for synonyms to be found.
|
||||
*
|
||||
* @author whoschek.AT.lbl.DOT.gov
|
||||
*/
|
||||
public class SynonymTokenFilter extends TokenFilter {
|
||||
|
||||
/** The Token.type used to indicate a synonym to higher level filters. */
|
||||
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
|
||||
|
||||
private final SynonymMap synonyms;
|
||||
private final int maxSynonyms;
|
||||
|
||||
private String[] stack = null;
|
||||
private int index = 0;
|
||||
private Token current = null;
|
||||
private int todo = 0;
|
||||
|
||||
/**
|
||||
* Creates an instance for the given underlying stream and synonym table.
|
||||
*
|
||||
* @param input
|
||||
* the underlying child token stream
|
||||
* @param synonyms
|
||||
* the map used to extract synonyms for terms
|
||||
* @param maxSynonyms
|
||||
* the maximum number of synonym tokens to return per underlying
|
||||
* token word (a value of Integer.MAX_VALUE indicates unlimited)
|
||||
*/
|
||||
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
|
||||
super(input);
|
||||
if (input == null)
|
||||
throw new IllegalArgumentException("input must not be null");
|
||||
if (synonyms == null)
|
||||
throw new IllegalArgumentException("synonyms must not be null");
|
||||
if (maxSynonyms < 0)
|
||||
throw new IllegalArgumentException("maxSynonyms must not be negative");
|
||||
|
||||
this.synonyms = synonyms;
|
||||
this.maxSynonyms = maxSynonyms;
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public Token next() throws IOException {
|
||||
Token token;
|
||||
while (todo > 0 && index < stack.length) { // pop from stack
|
||||
token = createToken(stack[index++], current);
|
||||
if (token != null) {
|
||||
todo--;
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
token = input.next();
|
||||
if (token == null) return null; // EOS; iterator exhausted
|
||||
|
||||
stack = synonyms.getSynonyms(token.termText()); // push onto stack
|
||||
if (stack.length > maxSynonyms) randomize(stack);
|
||||
index = 0;
|
||||
current = token;
|
||||
todo = maxSynonyms;
|
||||
return token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates and returns a token for the given synonym of the current input
|
||||
* token; Override for custom (stateless or stateful) behaviour, if desired.
|
||||
*
|
||||
* @param synonym
|
||||
* a synonym for the current token's term
|
||||
* @param current
|
||||
* the current token from the underlying child stream
|
||||
* @return a new token, or null to indicate that the given synonym should be
|
||||
* ignored
|
||||
*/
|
||||
protected Token createToken(String synonym, Token current) {
|
||||
Token token = new Token(
|
||||
synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
|
||||
token.setPositionIncrement(0);
|
||||
return token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Randomize synonyms to later sample a subset. Uses constant random seed
|
||||
* for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
|
||||
* number generator with medium statistical quality (multiplicative
|
||||
* congruential method), producing integers in the range [Integer.MIN_VALUE,
|
||||
* Integer.MAX_VALUE].
|
||||
*/
|
||||
private static void randomize(Object[] arr) {
|
||||
int seed = 1234567; // constant
|
||||
int randomState = 4*seed + 1;
|
||||
// Random random = new Random(seed); // unnecessary overhead
|
||||
int len = arr.length;
|
||||
for (int i=0; i < len-1; i++) {
|
||||
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
|
||||
int r = randomState % (len-i);
|
||||
if (r < 0) r = -r; // e.g. -9 % 2 == -1
|
||||
// int r = random.nextInt(len-i);
|
||||
|
||||
// swap arr[i, i+r]
|
||||
Object tmp = arr[i];
|
||||
arr[i] = arr[i + r];
|
||||
arr[i + r] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue