git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@351896 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
Wolfgang Hoschek 2005-12-03 05:44:16 +00:00
parent a155416b4d
commit ebe44ace90
2 changed files with 529 additions and 0 deletions

View File

@ -0,0 +1,395 @@
package org.apache.lucene.index.memory;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Loads the <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
* href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
* into a thread-safe main-memory hash map that can be used for fast
* high-frequency lookups of synonyms for any given (lowercase) word string.
* <p>
* There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
* It does not necessarily hold that: A -> B and B -> C imply A -> C.
* <p>
* Loading typically takes some 1.5 secs, so should be done only once per
* (server) program execution, using a singleton pattern. Once loaded, a
* synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
* A loaded default synonym map consumes about 10 MB main memory.
* An instance is immutable, hence thread-safe.
* <p>
* This implementation borrows some ideas from the Lucene Syns2Index demo that
* Dave Spencer originally contributed to Lucene. Dave's approach
* involved a persistent Lucene index which is suitable for occasional
* lookups or very large synonym tables, but considered unsuitable for
* high-frequency lookups of medium size synonym tables.
* <p>
* Example Usage:
* <pre>
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
* for (int i = 0; i &lt; words.length; i++) {
* String[] synonyms = map.getSynonyms(words[i]);
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
* }
*
* Example output:
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
* woods:[forest, wood]
* forest:[afforest, timber, timberland, wood, woodland, woods]
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
* xxxx:[]
* </pre>
*
* @author whoschek.AT.lbl.DOT.gov
* @see <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
* man page </a>
* @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
*/
public class SynonymMap {
/** the index data; Map<String word, String[] synonyms> */
private final HashMap table;
private static final String[] EMPTY = new String[0];
private static final boolean DEBUG = false;
/**
* Constructs an instance, loading WordNet synonym data from the given input
* stream. Finally closes the stream. The words in the stream must be in
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
*
* @param input
* the stream to read from (null indicates an empty synonym map)
* @throws IOException
* if an error occured while reading the stream.
*/
public SynonymMap(InputStream input) throws IOException {
this.table = input == null ? new HashMap(0) : read(toByteArray(input));
}
/**
* Returns the synonym set for the given word, sorted ascending.
*
* @param word
* the word to lookup (must be in lowercase).
* @return the synonyms; a set of zero or more words, sorted ascending, each
* word containing lowercase characters that satisfy
* <code>Character.isLetter()</code>.
*/
public String[] getSynonyms(String word) {
Object syns = table.get(word);
if (syns == null) return EMPTY;
if (syns instanceof String) return new String[] {(String) syns};
String[] synonyms = (String[]) syns;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
}
/** Returns a String representation of the index data for debugging purposes. */
public String toString() {
StringBuffer buf = new StringBuffer();
Iterator iter = new TreeMap(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = (String) iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
/**
* Analyzes/transforms the given word on input stream loading. This default implementation simply
* lowercases the word. Override this method with a custom stemming
* algorithm or similar, if desired.
*
* @param word
* the word to analyze
* @return the same word, or a different word (or null to indicate that the
* word should be ignored)
*/
protected String analyze(String word) {
return word.toLowerCase();
}
private static boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
}
private HashMap read(byte[] data) {
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
// scan to beginning of group
while (i < len && data[i] != '(') i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',') {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'') i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'') i++;
i++;
} while (i < len && data[i] != ','); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = (String) internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = new Integer(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList groups = (ArrayList) word2Groups.get(word);
if (groups == null) {
groups = new ArrayList(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList words = (ArrayList) group2Words.get(group);
if (words == null) {
words = new ArrayList(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
group2Words = null; // help gc
return optimize(word2Syns, internedWords);
}
private HashMap createIndex(Map word2Groups, Map group2Words) {
HashMap word2Syns = new HashMap();
Iterator iter = word2Groups.entrySet().iterator();
while (iter.hasNext()) { // for each word
Map.Entry entry = (Map.Entry) iter.next();
ArrayList group = (ArrayList) entry.getValue();
String word = (String) entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet synonyms = new TreeSet();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList words = (ArrayList) group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
Object synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly it's own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = (String) synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
}
private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry entry = (Map.Entry) iter.next();
allSynonyms[j] = (String[]) entry.getValue();
words[j] = (String) entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuffer buf = new StringBuffer(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = (String) internedWords.get(syns[k]);
}
Object replacement = syns;
if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), replacement);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
}
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() behaves weird or buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
}
}

View File

@ -0,0 +1,134 @@
package org.apache.lucene.index.memory;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Injects additional tokens for synonyms of token terms fetched from the
* underlying child stream; the child stream must deliver lowercase tokens
* for synonyms to be found.
*
* @author whoschek.AT.lbl.DOT.gov
*/
public class SynonymTokenFilter extends TokenFilter {

  /** The Token.type used to indicate a synonym to higher level filters. */
  public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";

  private final SynonymMap synonyms;
  private final int maxSynonyms;

  private String[] stack = null;   // synonyms of the current input token, pending emission
  private int index = 0;           // next stack slot to emit
  private Token current = null;    // the input token the pending synonyms belong to
  private int todo = 0;            // budget of synonym tokens still allowed for this word

  /**
   * Creates an instance for the given underlying stream and synonym table.
   *
   * @param input
   *          the underlying child token stream
   * @param synonyms
   *          the map used to extract synonyms for terms
   * @param maxSynonyms
   *          the maximum number of synonym tokens to return per underlying
   *          token word (a value of Integer.MAX_VALUE indicates unlimited)
   */
  public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input);
    if (input == null)
      throw new IllegalArgumentException("input must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");

    this.synonyms = synonyms;
    this.maxSynonyms = maxSynonyms;
  }

  /** Returns the next token in the stream, or null at EOS. */
  public Token next() throws IOException {
    // first drain any synonyms still pending for the previous input token
    while (todo > 0 && index < stack.length) {
      Token syn = createToken(stack[index++], current);
      if (syn == null) continue; // subclass chose to suppress this synonym
      todo--;
      return syn;
    }

    // pull the next real token from the child stream
    Token token = input.next();
    if (token == null) return null; // EOS; iterator exhausted

    stack = synonyms.getSynonyms(token.termText()); // push onto stack
    if (stack.length > maxSynonyms) randomize(stack); // later emit only a sampled subset
    index = 0;
    current = token;
    todo = maxSynonyms;
    return token;
  }

  /**
   * Creates and returns a token for the given synonym of the current input
   * token; Override for custom (stateless or stateful) behaviour, if desired.
   *
   * @param synonym
   *          a synonym for the current token's term
   * @param current
   *          the current token from the underlying child stream
   * @return a new token, or null to indicate that the given synonym should be
   *         ignored
   */
  protected Token createToken(String synonym, Token current) {
    Token token = new Token(
      synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
    token.setPositionIncrement(0); // synonym occupies the same position as the original term
    return token;
  }

  /**
   * Randomize synonyms to later sample a subset. Uses constant random seed
   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
   * number generator with medium statistical quality (multiplicative
   * congruential method), producing integers in the range [Integer.MIN_VALUE,
   * Integer.MAX_VALUE].
   */
  private static void randomize(Object[] arr) {
    final int seed = 1234567; // constant
    int state = 4 * seed + 1;
//    Random random = new Random(seed); // unnecessary overhead
    int len = arr.length;

    for (int i = 0; i < len - 1; i++) {
      state *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
      int r = state % (len - i);
      if (r < 0) r = -r; // Java's % may yield negatives, e.g. -9 % 2 == -1
//      int r = random.nextInt(len-i);

      // swap arr[i] and arr[i + r]
      Object tmp = arr[i];
      arr[i] = arr[i + r];
      arr[i + r] = tmp;
    }
  }

}