git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@351896 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
Wolfgang Hoschek 2005-12-03 05:44:16 +00:00
parent a155416b4d
commit ebe44ace90
2 changed files with 529 additions and 0 deletions

View File

@ -0,0 +1,395 @@
package org.apache.lucene.index.memory;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Loads the <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
* href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
* into a thread-safe main-memory hash map that can be used for fast
* high-frequency lookups of synonyms for any given (lowercase) word string.
* <p>
* There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
* It does not necessarily hold that: A -> B and B -> C imply A -> C.
* <p>
* Loading typically takes some 1.5 secs, so should be done only once per
* (server) program execution, using a singleton pattern. Once loaded, a
* synonym lookup via {@link #getSynonyms(String)} takes constant time O(1).
* A loaded default synonym map consumes about 10 MB main memory.
* An instance is immutable, hence thread-safe.
* <p>
* This implementation borrows some ideas from the Lucene Syns2Index demo that
* Dave Spencer originally contributed to Lucene. Dave's approach
* involved a persistent Lucene index which is suitable for occasional
* lookups or very large synonym tables, but considered unsuitable for
* high-frequency lookups of medium size synonym tables.
* <p>
* Example Usage:
* <pre>
* String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
* SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
* for (int i = 0; i &lt; words.length; i++) {
* String[] synonyms = map.getSynonyms(words[i]);
* System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
* }
*
* Example output:
* hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
* woods:[forest, wood]
* forest:[afforest, timber, timberland, wood, woodland, woods]
* wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
* xxxx:[]
* </pre>
*
* @author whoschek.AT.lbl.DOT.gov
* @see <a target="_blank"
* href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
* man page </a>
* @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
*/
public class SynonymMap {
/** the index data; Map<String word, String[] synonyms> */
private final HashMap table;
private static final String[] EMPTY = new String[0];
private static final boolean DEBUG = false;
/**
* Constructs an instance, loading WordNet synonym data from the given input
* stream. Finally closes the stream. The words in the stream must be in
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
*
* @param input
* the stream to read from (null indicates an empty synonym map)
* @throws IOException
* if an error occured while reading the stream.
*/
public SynonymMap(InputStream input) throws IOException {
this.table = input == null ? new HashMap(0) : read(toByteArray(input));
}
/**
* Returns the synonym set for the given word, sorted ascending.
*
* @param word
* the word to lookup (must be in lowercase).
* @return the synonyms; a set of zero or more words, sorted ascending, each
* word containing lowercase characters that satisfy
* <code>Character.isLetter()</code>.
*/
public String[] getSynonyms(String word) {
Object syns = table.get(word);
if (syns == null) return EMPTY;
if (syns instanceof String) return new String[] {(String) syns};
String[] synonyms = (String[]) syns;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
}
/** Returns a String representation of the index data for debugging purposes. */
public String toString() {
StringBuffer buf = new StringBuffer();
Iterator iter = new TreeMap(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = (String) iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
/**
* Analyzes/transforms the given word on input stream loading. This default implementation simply
* lowercases the word. Override this method with a custom stemming
* algorithm or similar, if desired.
*
* @param word
* the word to analyze
* @return the same word, or a different word (or null to indicate that the
* word should be ignored)
*/
protected String analyze(String word) {
return word.toLowerCase();
}
private static boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
}
private HashMap read(byte[] data) {
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
// scan to beginning of group
while (i < len && data[i] != '(') i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',') {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'') i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'') i++;
i++;
} while (i < len && data[i] != ','); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = (String) internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = new Integer(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList groups = (ArrayList) word2Groups.get(word);
if (groups == null) {
groups = new ArrayList(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList words = (ArrayList) group2Words.get(group);
if (words == null) {
words = new ArrayList(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
group2Words = null; // help gc
return optimize(word2Syns, internedWords);
}
private HashMap createIndex(Map word2Groups, Map group2Words) {
HashMap word2Syns = new HashMap();
Iterator iter = word2Groups.entrySet().iterator();
while (iter.hasNext()) { // for each word
Map.Entry entry = (Map.Entry) iter.next();
ArrayList group = (ArrayList) entry.getValue();
String word = (String) entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet synonyms = new TreeSet();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList words = (ArrayList) group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
Object synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly it's own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = (String) synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
}
private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry entry = (Map.Entry) iter.next();
allSynonyms[j] = (String[]) entry.getValue();
words[j] = (String) entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuffer buf = new StringBuffer(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = (String) internedWords.get(syns[k]);
}
Object replacement = syns;
if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), replacement);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
}
// the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() behaves weird or buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
}
}

View File

@ -0,0 +1,134 @@
package org.apache.lucene.index.memory;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Injects additional tokens for synonyms of token terms fetched from the
* underlying child stream; the child stream must deliver lowercase tokens
* for synonyms to be found.
*
* @author whoschek.AT.lbl.DOT.gov
*/
public class SynonymTokenFilter extends TokenFilter {

  /** The Token.type used to indicate a synonym to higher level filters. */
  public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";

  private final SynonymMap synonyms;
  private final int maxSynonyms;

  private String[] stack = null;   // synonyms of the current input token, pending emission
  private int index = 0;           // next stack slot to emit
  private Token current = null;    // the input token the pending synonyms belong to
  private int todo = 0;            // budget of synonym tokens still allowed for this word

  /**
   * Creates an instance for the given underlying stream and synonym table.
   *
   * @param input
   *          the underlying child token stream
   * @param synonyms
   *          the map used to extract synonyms for terms
   * @param maxSynonyms
   *          the maximum number of synonym tokens to return per underlying
   *          token word (a value of Integer.MAX_VALUE indicates unlimited)
   */
  public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input);
    if (input == null)
      throw new IllegalArgumentException("input must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");

    this.synonyms = synonyms;
    this.maxSynonyms = maxSynonyms;
  }

  /** Returns the next token in the stream, or null at EOS. */
  public Token next() throws IOException {
    // first drain any synonyms still pending for the previous input token
    while (todo > 0 && index < stack.length) {
      Token syn = createToken(stack[index++], current);
      if (syn == null) continue; // subclass chose to suppress this synonym
      todo--;
      return syn;
    }

    // pull the next real token from the child stream
    Token token = input.next();
    if (token == null) return null; // EOS; iterator exhausted

    stack = synonyms.getSynonyms(token.termText()); // push onto stack
    if (stack.length > maxSynonyms) randomize(stack); // later emit only a sampled subset
    index = 0;
    current = token;
    todo = maxSynonyms;
    return token;
  }

  /**
   * Creates and returns a token for the given synonym of the current input
   * token; Override for custom (stateless or stateful) behaviour, if desired.
   *
   * @param synonym
   *          a synonym for the current token's term
   * @param current
   *          the current token from the underlying child stream
   * @return a new token, or null to indicate that the given synonym should be
   *         ignored
   */
  protected Token createToken(String synonym, Token current) {
    Token token = new Token(
      synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
    token.setPositionIncrement(0); // synonym occupies the same position as the original term
    return token;
  }

  /**
   * Randomize synonyms to later sample a subset. Uses constant random seed
   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
   * number generator with medium statistical quality (multiplicative
   * congruential method), producing integers in the range [Integer.MIN_VALUE,
   * Integer.MAX_VALUE].
   */
  private static void randomize(Object[] arr) {
    final int seed = 1234567; // constant
    int state = 4 * seed + 1;
//    Random random = new Random(seed); // unnecessary overhead
    int len = arr.length;

    for (int i = 0; i < len - 1; i++) {
      state *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
      int r = state % (len - i);
      if (r < 0) r = -r; // Java's % may yield negatives, e.g. -9 % 2 == -1
//      int r = random.nextInt(len-i);

      // swap arr[i] and arr[i + r]
      Object tmp = arr[i];
      arr[i] = arr[i + r];
      arr[i + r] = tmp;
    }
  }

}