mirror of https://github.com/apache/lucene.git
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@351896 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a155416b4d
commit
ebe44ace90
|
@ -0,0 +1,395 @@
|
|||
package org.apache.lucene.index.memory;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
 * Loads the <a target="_blank"
 * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a
 * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a>
 * into a thread-safe main-memory hash map that can be used for fast
 * high-frequency lookups of synonyms for any given (lowercase) word string.
 * <p>
 * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A).
 * There does not necessarily hold: A -> B, B -> C then A -> C.
 * <p>
 * Loading typically takes some 1.5 secs, so should be done only once per
 * (server) program execution, using a singleton pattern. Once loaded, a
 * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1).
 * A loaded default synonym map consumes about 10 MB main memory.
 * An instance is immutable, hence thread-safe.
 * <p>
 * This implementation borrows some ideas from the Lucene Syns2Index demo that
 * Dave Spencer originally contributed to Lucene. Dave's approach
 * involved a persistent Lucene index which is suitable for occasional
 * lookups or very large synonym tables, but considered unsuitable for
 * high-frequency lookups of medium size synonym tables.
 * <p>
 * Example Usage:
 * <pre>
 * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"};
 * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
 * for (int i = 0; i < words.length; i++) {
 *     String[] synonyms = map.getSynonyms(words[i]);
 *     System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString());
 * }
 *
 * Example output:
 * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough]
 * woods:[forest, wood]
 * forest:[afforest, timber, timberland, wood, woodland, woods]
 * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike]
 * xxxx:[]
 * </pre>
 *
 * @author whoschek.AT.lbl.DOT.gov
 * @see <a target="_blank"
 *      href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb
 *      man page </a>
 * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a>
 */
public class SynonymMap {

  /**
   * the index data; Map&lt;String word, String[] synonyms&gt;.
   * Note: for words with exactly one synonym the value is stored as a bare
   * String rather than a String[1], to save memory (see optimize()).
   */
  private final HashMap table;

  // shared zero-length result, so getSynonyms never allocates for misses
  private static final String[] EMPTY = new String[0];

  private static final boolean DEBUG = false;

  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   *
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
  }

  /**
   * Returns the synonym set for the given word, sorted ascending.
   *
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    Object syns = table.get(word);
    if (syns == null) return EMPTY;
    // single-synonym entries are stored unwrapped as a plain String
    if (syns instanceof String) return new String[] {(String) syns};

    String[] synonyms = (String[]) syns;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }

  /**
   * Returns a String representation of the index data for debugging purposes.
   * Lists each word with its synonyms, followed by summary statistics
   * (f0..f3 = number of words having exactly 0..3 synonyms).
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    // TreeMap wrapper yields the words in ascending (sorted) key order
    Iterator iter = new TreeMap(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;

    while (iter.hasNext()) {
      String word = (String) iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }

    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }

  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   *
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }

  /**
   * Returns true if the string consists exclusively of letters; used to
   * filter out phrases (spaces, hyphens) and other non-alphabetic entries.
   */
  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }

  /**
   * Parses the raw wn_s.pl bytes and builds the word --&gt; synonyms index.
   * Each relevant input line is a prolog fact of the shape
   * <code>s(synset_id,w_num,'word',...).</code>; the parser extracts the
   * leading synset number (the "group") and the quoted word, byte by byte,
   * without allocating intermediate line strings.
   *
   * @param data the complete stream contents, assumed UTF-8 encoded
   * @return the optimized index; Map&lt;String word, String or String[] synonyms&gt;
   */
  private HashMap read(byte[] data) {
    int WORDS = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap word2Groups = new HashMap(WORDS); // Map<String word, ArrayList groups>
    HashMap group2Words = new HashMap(GROUPS); // Map<Integer group, ArrayList words>
    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;

    while (i < len) { // until EOF
      /* Part A: Parse a line */

      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;

      // parse group (decimal digits up to the first comma; 48 == '0')
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);

      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;

      // scan to end of word; looping until "'," copes with embedded
      // escaped quotes inside the word itself
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"

      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII

      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore


      /* Part C: Add (group,word) to tables */

      // ensure compact string representation, minimizing memory overhead;
      // IMPORTANT: this app-level interning makes equal words reference-identical,
      // which createIndex() relies on (it compares synonyms with != rather than equals)
      String w = (String) internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }

      // reuse the boxed Integer while consecutive lines share the same group id
      Integer group = lastGroup;
      if (num != lastNum) {
        group = new Integer(num);
        lastGroup = group;
        lastNum = num;
      }

      // add word --> group
      ArrayList groups = (ArrayList) word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);

      // add group --> word
      ArrayList words = (ArrayList) group2Words.get(group);
      if (words == null) {
        words = new ArrayList(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }


    /* Part D: compute index data structure */
    HashMap word2Syns = createIndex(word2Groups, group2Words);

    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    group2Words = null; // help gc
    return optimize(word2Syns, internedWords);
  }

  /**
   * Joins the two intermediate tables into the final index: for each word,
   * collects the union of all words sharing any of its groups (minus the
   * word itself), sorted ascending via TreeSet.
   *
   * @param word2Groups Map&lt;String word, ArrayList groups&gt;
   * @param group2Words Map&lt;Integer group, ArrayList words&gt;
   * @return Map&lt;String word, String[] synonyms&gt;
   */
  private HashMap createIndex(Map word2Groups, Map group2Words) {
    HashMap word2Syns = new HashMap();
    Iterator iter = word2Groups.entrySet().iterator();

    while (iter.hasNext()) { // for each word
      Map.Entry entry = (Map.Entry) iter.next();
      ArrayList group = (ArrayList) entry.getValue();
      String word = (String) entry.getKey();

//      HashSet synonyms = new HashSet();
      TreeSet synonyms = new TreeSet();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList words = (ArrayList) group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words
          Object synonym = words.get(j); // note that w and word are interned
          // identity comparison is safe (and fast) because read() interned all words
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }

      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = (String) synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }

    return word2Syns;
  }

  /**
   * Shrinks memory consumption by (a) re-interning every key and synonym as a
   * substring overlay of one large shared string (so the many small per-word
   * char arrays can be garbage collected) and (b) unwrapping single-element
   * synonym arrays into bare Strings (getSynonyms rewraps them on lookup).
   *
   * @param word2Syns the index produced by createIndex (mutated in place)
   * @param internedWords the word interning table from read()
   * @return the same word2Syns map, now holding the compacted entries
   */
  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }

    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry entry = (Map.Entry) iter.next();
      allSynonyms[j] = (String[]) entry.getValue();
      words[j] = (String) entry.getKey();
      len += words[j].length();
    }

    // assemble large string containing all words
    StringBuffer buf = new StringBuffer(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;

    // intern words at app level via memory-overlaid substrings
    // NOTE(review): assumes substring shares the backing char[] of allWords,
    // true on the JDKs of the era this was written for — confirm on target JDK
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }

    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = (String) internedWords.get(syns[k]);
      }
      Object replacement = syns;
      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), replacement);
    }

    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }

  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux

  /**
   * Reads the entire stream into a byte array, growing the buffer as needed.
   * Does NOT close the stream on normal return paths only — the finally
   * block closes it in all cases.
   *
   * @param input the stream to drain (closed before returning)
   * @return the exact bytes read (length == number of bytes in the stream)
   * @throws IOException if reading fails
   */
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          // deliberate swap: the old output array becomes the read buffer so
          // subsequent bulk reads can be larger; tmp becomes the new output
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }

      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }

}
|
|
@ -0,0 +1,134 @@
|
|||
package org.apache.lucene.index.memory;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Injects additional tokens for synonyms of token terms fetched from the
|
||||
* underlying child stream; the child stream must deliver lowercase tokens
|
||||
* for synonyms to be found.
|
||||
*
|
||||
* @author whoschek.AT.lbl.DOT.gov
|
||||
*/
|
||||
public class SynonymTokenFilter extends TokenFilter {
|
||||
|
||||
/** The Token.type used to indicate a synonym to higher level filters. */
|
||||
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
|
||||
|
||||
private final SynonymMap synonyms;
|
||||
private final int maxSynonyms;
|
||||
|
||||
private String[] stack = null;
|
||||
private int index = 0;
|
||||
private Token current = null;
|
||||
private int todo = 0;
|
||||
|
||||
/**
|
||||
* Creates an instance for the given underlying stream and synonym table.
|
||||
*
|
||||
* @param input
|
||||
* the underlying child token stream
|
||||
* @param synonyms
|
||||
* the map used to extract synonyms for terms
|
||||
* @param maxSynonyms
|
||||
* the maximum number of synonym tokens to return per underlying
|
||||
* token word (a value of Integer.MAX_VALUE indicates unlimited)
|
||||
*/
|
||||
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
|
||||
super(input);
|
||||
if (input == null)
|
||||
throw new IllegalArgumentException("input must not be null");
|
||||
if (synonyms == null)
|
||||
throw new IllegalArgumentException("synonyms must not be null");
|
||||
if (maxSynonyms < 0)
|
||||
throw new IllegalArgumentException("maxSynonyms must not be negative");
|
||||
|
||||
this.synonyms = synonyms;
|
||||
this.maxSynonyms = maxSynonyms;
|
||||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public Token next() throws IOException {
|
||||
Token token;
|
||||
while (todo > 0 && index < stack.length) { // pop from stack
|
||||
token = createToken(stack[index++], current);
|
||||
if (token != null) {
|
||||
todo--;
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
||||
token = input.next();
|
||||
if (token == null) return null; // EOS; iterator exhausted
|
||||
|
||||
stack = synonyms.getSynonyms(token.termText()); // push onto stack
|
||||
if (stack.length > maxSynonyms) randomize(stack);
|
||||
index = 0;
|
||||
current = token;
|
||||
todo = maxSynonyms;
|
||||
return token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates and returns a token for the given synonym of the current input
|
||||
* token; Override for custom (stateless or stateful) behaviour, if desired.
|
||||
*
|
||||
* @param synonym
|
||||
* a synonym for the current token's term
|
||||
* @param current
|
||||
* the current token from the underlying child stream
|
||||
* @return a new token, or null to indicate that the given synonym should be
|
||||
* ignored
|
||||
*/
|
||||
protected Token createToken(String synonym, Token current) {
|
||||
Token token = new Token(
|
||||
synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
|
||||
token.setPositionIncrement(0);
|
||||
return token;
|
||||
}
|
||||
|
||||
/**
|
||||
* Randomize synonyms to later sample a subset. Uses constant random seed
|
||||
* for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
|
||||
* number generator with medium statistical quality (multiplicative
|
||||
* congruential method), producing integers in the range [Integer.MIN_VALUE,
|
||||
* Integer.MAX_VALUE].
|
||||
*/
|
||||
private static void randomize(Object[] arr) {
|
||||
int seed = 1234567; // constant
|
||||
int randomState = 4*seed + 1;
|
||||
// Random random = new Random(seed); // unnecessary overhead
|
||||
int len = arr.length;
|
||||
for (int i=0; i < len-1; i++) {
|
||||
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
|
||||
int r = randomState % (len-i);
|
||||
if (r < 0) r = -r; // e.g. -9 % 2 == -1
|
||||
// int r = random.nextInt(len-i);
|
||||
|
||||
// swap arr[i, i+r]
|
||||
Object tmp = arr[i];
|
||||
arr[i] = arr[i + r];
|
||||
arr[i + r] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue