From 8348498d08feec9a8e45ecf2cd9a1521dce8bfaa Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Thu, 24 Jan 2008 04:41:38 +0000 Subject: [PATCH] SOLR-466: add CharArrayMap, update SynonymMap to use char[] rather than String Tokens git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@614793 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/solr/analysis/SynonymFilter.java | 57 +-- .../solr/analysis/SynonymFilterFactory.java | 19 +- .../org/apache/solr/analysis/SynonymMap.java | 54 ++- .../org/apache/solr/util/CharArrayMap.java | 411 ++++++++++++++++++ .../solr/analysis/TestSynonymFilter.java | 2 +- .../apache/solr/util/TestCharArrayMap.java | 208 +++++++++ 6 files changed, 692 insertions(+), 59 deletions(-) create mode 100755 src/java/org/apache/solr/util/CharArrayMap.java create mode 100755 src/test/org/apache/solr/util/TestCharArrayMap.java diff --git a/src/java/org/apache/solr/analysis/SynonymFilter.java b/src/java/org/apache/solr/analysis/SynonymFilter.java index 90a5af73891..1191aa767d7 100644 --- a/src/java/org/apache/solr/analysis/SynonymFilter.java +++ b/src/java/org/apache/solr/analysis/SynonymFilter.java @@ -39,13 +39,11 @@ import java.util.LinkedList; public class SynonymFilter extends TokenFilter { private final SynonymMap map; // Map - private final boolean ignoreCase; - private Iterator replacement; // iterator over generated tokens + private Iterator replacement; // iterator over generated tokens - public SynonymFilter(TokenStream in, SynonymMap map, boolean ignoreCase) { + public SynonymFilter(TokenStream in, SynonymMap map) { super(in); this.map = map; - this.ignoreCase = ignoreCase; } @@ -66,26 +64,26 @@ public class SynonymFilter extends TokenFilter { * merging token streams to preserve token positions. * - preserve original positionIncrement of first matched token */ - public Token next() throws IOException { + @Override + public Token next(Token target) throws IOException { while (true) { // if there are any generated tokens, return them... don't try any // matches against them, as we specifically don't want recursion. if (replacement!=null && replacement.hasNext()) { - return (Token)replacement.next(); + return replacement.next(); } // common case fast-path of first token not matching anything - Token firstTok = nextTok(); - if (firstTok ==null) return null; - String str = ignoreCase ? firstTok.termText().toLowerCase() : firstTok.termText(); - Object o = map.submap!=null ? map.submap.get(str) : null; - if (o == null) return firstTok; + Token firstTok = nextTok(target); + if (firstTok == null) return null; + SynonymMap result = map.submap!=null ? map.submap.get(firstTok.termBuffer(), 0, firstTok.termLength()) : null; + if (result == null) return firstTok; // OK, we matched a token, so find the longest match. - matched = new LinkedList(); + matched = new LinkedList(); - SynonymMap result = match((SynonymMap)o); + result = match(result); if (result==null) { // no match, simply return the first token read. @@ -93,13 +91,13 @@ public class SynonymFilter extends TokenFilter { } // reuse, or create new one each time? - ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1); + ArrayList generated = new ArrayList(result.synonyms.length + matched.size() + 1); // // there was a match... let's generate the new tokens, merging // in the matched tokens (position increments need adjusting) // - Token lastTok = matched.isEmpty() ? firstTok : (Token)matched.getLast(); + Token lastTok = matched.isEmpty() ? firstTok : matched.getLast(); boolean includeOrig = result.includeOrig(); Token origTok = includeOrig ? firstTok : null; @@ -109,7 +107,8 @@ public class SynonymFilter extends TokenFilter { for (int i=0; i buffer; + private LinkedList matched; private Token nextTok() throws IOException { if (buffer!=null && !buffer.isEmpty()) { - return (Token)buffer.removeFirst(); + return buffer.removeFirst(); } else { return input.next(); } } + private Token nextTok(Token target) throws IOException { + if (buffer!=null && !buffer.isEmpty()) { + return buffer.removeFirst(); + } else { + return input.next(target); + } + } + private void pushTok(Token t) { - if (buffer==null) buffer=new LinkedList(); + if (buffer==null) buffer=new LinkedList(); buffer.addFirst(t); } @@ -177,9 +182,7 @@ public class SynonymFilter extends TokenFilter { Token tok = nextTok(); if (tok != null) { // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level? - String str = ignoreCase ? tok.termText().toLowerCase() : tok.termText(); - - SynonymMap subMap = (SynonymMap)map.submap.get(str); + SynonymMap subMap = map.submap.get(tok.termBuffer(), 0, tok.termLength()); if (subMap != null) { // recurse diff --git a/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index 8a25ded32c5..c4a0cec559e 100644 --- a/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -35,8 +35,8 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso public void inform(ResourceLoader loader) { String synonyms = args.get("synonyms"); - ignoreCase = getBoolean("ignoreCase",false); - expand = getBoolean("expand",true); + boolean ignoreCase = getBoolean("ignoreCase", false); + boolean expand = getBoolean("expand", true); if (synonyms != null) { List wlist=null; @@ -45,8 +45,8 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso } catch (IOException e) { throw new RuntimeException(e); } - synMap = new SynonymMap(); - parseRules(wlist, synMap, "=>", ",", ignoreCase,expand); + synMap = new SynonymMap(ignoreCase); + parseRules(wlist, synMap, "=>", ",", expand); if (wlist.size()<=20) { SolrCore.log.fine("SynonymMap "+synonyms +":"+synMap); } @@ -54,10 +54,8 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso } private SynonymMap synMap; - private boolean ignoreCase; - private boolean expand; - private static void parseRules(List rules, SynonymMap map, String mappingSep, String synSep, boolean ignoreCase, boolean expansion) { + private static void parseRules(List rules, SynonymMap map, String mappingSep, String synSep, boolean expansion) { int count=0; for (String rule : rules) { // To use regexes, we need an expression that specifies an odd number of chars. @@ -91,10 +89,11 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso for (List fromToks : source) { count++; for (List toToks : target) { - map.add(ignoreCase ? StrUtils.toLower(fromToks) : fromToks, + map.add(fromToks, SynonymMap.makeTokens(toToks), includeOrig, - true); + true + ); } } } @@ -114,7 +113,7 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso public SynonymFilter create(TokenStream input) { - return new SynonymFilter(input,synMap,ignoreCase); + return new SynonymFilter(input,synMap); } diff --git a/src/java/org/apache/solr/analysis/SynonymMap.java b/src/java/org/apache/solr/analysis/SynonymMap.java index 8488529ca2c..2e8bec5052b 100644 --- a/src/java/org/apache/solr/analysis/SynonymMap.java +++ b/src/java/org/apache/solr/analysis/SynonymMap.java @@ -18,6 +18,7 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.Token; +import org.apache.solr.util.CharArrayMap; import java.util.*; @@ -26,13 +27,20 @@ import java.util.*; * @version $Id$ */ public class SynonymMap { - Map submap; // recursive: Map + CharArrayMap submap; // recursive: Map Token[] synonyms; int flags; static final int INCLUDE_ORIG=0x01; + static final int IGNORE_CASE=0x02; + + public SynonymMap() {} + public SynonymMap(boolean ignoreCase) { + if (ignoreCase) flags |= IGNORE_CASE; + } public boolean includeOrig() { return (flags & INCLUDE_ORIG) != 0; } + public boolean ignoreCase() { return (flags & IGNORE_CASE) != 0; } /** * @param singleMatch List, the sequence of strings to match @@ -40,17 +48,17 @@ public class SynonymMap { * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens * @param mergeExisting merge the replacement tokens with any other mappings that exist */ - public void add(List singleMatch, List replacement, boolean includeOrig, boolean mergeExisting) { + public void add(List singleMatch, List replacement, boolean includeOrig, boolean mergeExisting) { SynonymMap currMap = this; - for (Iterator iter = singleMatch.iterator(); iter.hasNext();) { - String str = (String)iter.next(); + for (String str : singleMatch) { if (currMap.submap==null) { - currMap.submap = new HashMap(1); + currMap.submap = new CharArrayMap(1, ignoreCase()); } - SynonymMap map = (SynonymMap)currMap.submap.get(str); + SynonymMap map = currMap.submap.get(str); if (map==null) { map = new SynonymMap(); + map.flags |= flags & IGNORE_CASE; currMap.submap.put(str, map); } @@ -68,7 +76,7 @@ public class SynonymMap { public String toString() { - StringBuffer sb = new StringBuffer("<"); + StringBuilder sb = new StringBuilder("<"); if (synonyms!=null) { sb.append("["); for (int i=0; i from a List */ - public static List makeTokens(List strings) { - List ret = new ArrayList(strings.size()); - for (Iterator iter = strings.iterator(); iter.hasNext();) { - Token newTok = new Token((String)iter.next(),0,0,"SYNONYM"); + public static List makeTokens(List strings) { + List ret = new ArrayList(strings.size()); + for (String str : strings) { + //Token newTok = new Token(str,0,0,"SYNONYM"); + Token newTok = new Token(0,0,"SYNONYM"); + newTok.setTermBuffer(str.toCharArray(), 0, str.length()); ret.add(newTok); } return ret; @@ -106,8 +116,8 @@ public class SynonymMap { * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n) * */ - public static List mergeTokens(List lst1, List lst2) { - ArrayList result = new ArrayList(); + public static List mergeTokens(List lst1, List lst2) { + ArrayList result = new ArrayList(); if (lst1 ==null || lst2 ==null) { if (lst2 != null) result.addAll(lst2); if (lst1 != null) result.addAll(lst1); @@ -115,27 +125,29 @@ public class SynonymMap { } int pos=0; - Iterator iter1=lst1.iterator(); - Iterator iter2=lst2.iterator(); - Token tok1 = iter1.hasNext() ? (Token)iter1.next() : null; - Token tok2 = iter2.hasNext() ? (Token)iter2.next() : null; + Iterator iter1=lst1.iterator(); + Iterator iter2=lst2.iterator(); + Token tok1 = iter1.hasNext() ? iter1.next() : null; + Token tok2 = iter2.hasNext() ? iter2.next() : null; int pos1 = tok1!=null ? tok1.getPositionIncrement() : 0; int pos2 = tok2!=null ? tok2.getPositionIncrement() : 0; while(tok1!=null || tok2!=null) { while (tok1 != null && (pos1 <= pos2 || tok2==null)) { - Token tok = new Token(tok1.termText(), tok1.startOffset(), tok1.endOffset(), tok1.type()); + Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type()); + tok.setTermBuffer(tok1.termBuffer(), 0, tok1.termLength()); tok.setPositionIncrement(pos1-pos); result.add(tok); pos=pos1; - tok1 = iter1.hasNext() ? (Token)iter1.next() : null; + tok1 = iter1.hasNext() ? iter1.next() : null; pos1 += tok1!=null ? tok1.getPositionIncrement() : 0; } while (tok2 != null && (pos2 <= pos1 || tok1==null)) { - Token tok = new Token(tok2.termText(), tok2.startOffset(), tok2.endOffset(), tok2.type()); + Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type()); + tok.setTermBuffer(tok2.termBuffer(), 0, tok2.termLength()); tok.setPositionIncrement(pos2-pos); result.add(tok); pos=pos2; - tok2 = iter2.hasNext() ? (Token)iter2.next() : null; + tok2 = iter2.hasNext() ? iter2.next() : null; pos2 += tok2!=null ? tok2.getPositionIncrement() : 0; } } diff --git a/src/java/org/apache/solr/util/CharArrayMap.java b/src/java/org/apache/solr/util/CharArrayMap.java new file mode 100755 index 00000000000..f0d1bb2f2bb --- /dev/null +++ b/src/java/org/apache/solr/util/CharArrayMap.java @@ -0,0 +1,411 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.util; + +import java.util.*; +import java.io.Serializable; + +/** + * A simple class that stores key Strings as char[]'s in a + * hash table. Note that this is not a general purpose + * class. For example, it cannot remove items from the + * map, nor does it resize its hash table to be smaller, + * etc. It is designed to be quick to retrieve items + * by char[] keys without the necessity of converting + * to a String first. + */ + +public class CharArrayMap extends AbstractMap + implements Map, Cloneable, Serializable +{ + private final static int INIT_SIZE = 2; + private char[][] keys; + private Object[] values; + private int count; + private final boolean ignoreCase; + + /** Create map with enough capacity to hold startSize + * terms */ + public CharArrayMap(int initialCapacity, boolean ignoreCase) { + this.ignoreCase = ignoreCase; + int size = INIT_SIZE; + // load factor of .75, inverse is 1.25, or x+x/4 + initialCapacity = initialCapacity + (initialCapacity >>2); + while(size <= initialCapacity) + size <<= 1; + keys = new char[size][]; + values = new Object[size]; + } + + public boolean ignoreCase() { + return ignoreCase; + } + + public V get(char[] key) { + return get(key, 0, key.length); + } + + public V get(char[] key, int off, int len) { + return (V)values[getSlot(key, off, len)]; + } + + public V get(CharSequence key) { + return (V)values[getSlot(key)]; + } + + @Override + public V get(Object key) { + return (V)values[getSlot(key)]; + } + + @Override + public boolean containsKey(Object s) { + return keys[getSlot(s)] != null; + } + + @Override + public boolean containsValue(Object value) { + if (value == null) { + // search for key with a null value + for (int i=0; i>8)+code)|1; + do { + code += inc; + pos = code & (keys.length-1); + key2 = keys[pos]; + } while (key2 != null && !equals(key, off, len, key2)); + } + return pos; + } + + /** Returns true if the String is in the set */ + private int getSlot(CharSequence key) { + int code = getHashCode(key); + int pos = code & (keys.length-1); + char[] key2 = keys[pos]; + if (key2 != null && !equals(key, key2)) { + final int inc = ((code>>8)+code)|1; + do { + code += inc; + pos = code & (keys.length-1); + key2 = keys[pos]; + } while (key2 != null && !equals(key, key2)); + } + return pos; + } + + public V put(CharSequence key, V val) { + return put(key.toString(), val); // could be more efficient + } + + @Override + public V put(String key, V val) { + return put(key.toCharArray(), val); + } + + /** Add this key,val pair to the map. + * The char[] key is directly used, no copy is made. + * If ignoreCase is true for this Map, the key array will be directly modified. + * The user should never modify the key after calling this method. + */ + public V put(char[] key, Object val) { + if (ignoreCase) + for(int i=0;i< key.length;i++) + key[i] = Character.toLowerCase(key[i]); + int slot = getSlot(key, 0, key.length); + if (keys[slot] == null) count++; + Object prev = values[slot]; + keys[slot] = key; + values[slot] = val; + + if (count + (count>>2) >= keys.length) { + rehash(); + } + + return (V)prev; + } + + + private boolean equals(char[] text1, int off, int len, char[] text2) { + if (len != text2.length) + return false; + if (ignoreCase) { + for(int i=0;i> entrySet() { + return new EntrySet(); + } + + /** Returns an EntryIterator over this Map. */ + public EntryIterator iterator() { + return new EntryIterator(); + } + + /** public iterator class so efficient methods are exposed to users */ + public class EntryIterator implements Iterator> { + int pos=-1; + int lastPos; + + EntryIterator() { + goNext(); + } + + private void goNext() { + lastPos = pos; + pos++; + while (pos < keys.length && keys[pos] == null) pos++; + } + + public boolean hasNext() { + return pos < keys.length; + } + + /** gets the next key... do not modify the returned char[] */ + public char[] nextKey() { + goNext(); + return keys[lastPos]; + } + + /** gets the next key as a newly created String object */ + public String nextKeyString() { + return new String(nextKey()); + } + + /** returns the value associated with the last key returned */ + public V currentValue() { + return (V)values[lastPos]; + } + + /** sets the value associated with the last key returned */ + public V setValue(V value) { + V old = (V)values[lastPos]; + values[lastPos] = value; + return old; + } + + /** Returns an Entry object created on the fly... + * use nextCharArray() + currentValie() for better efficiency. */ + public Map.Entry next() { + goNext(); + return new MapEntry(lastPos); + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + + private class MapEntry implements Map.Entry { + final int pos; + + MapEntry(int pos) { + this.pos = pos; + } + + public char[] getCharArr() { + return keys[pos]; + } + + public String getKey() { + return new String(getCharArr()); + } + + public V getValue() { + return (V)values[pos]; + } + + public V setValue(V value) { + V old = (V)values[pos]; + values[pos] = value; + return old; + } + + public String toString() { + return getKey() + '=' + getValue(); + } + } + + + + private class EntrySet extends AbstractSet> { + public EntryIterator iterator() { + return new EntryIterator(); + } + public boolean contains(Object o) { + if (!(o instanceof Map.Entry)) + return false; + Map.Entry e = (Map.Entry)o; + Object key = e.getKey(); + if (key==null) return false; // we don't support null keys + Object val = e.getValue(); + Object v = get(key); + return v==null ? val==null : v.equals(val); + } + public boolean remove(Object o) { + throw new UnsupportedOperationException(); + } + public int size() { + return count; + } + public void clear() { + CharArrayMap.this.clear(); + } + } + + @Override + public Object clone() { + CharArrayMap map = null; + try { + map = (CharArrayMap)super.clone(); + map.keys = keys.clone(); + map.values = keys.clone(); + } catch (CloneNotSupportedException e) { + // impossible + } + return map; + } +} diff --git a/src/test/org/apache/solr/analysis/TestSynonymFilter.java b/src/test/org/apache/solr/analysis/TestSynonymFilter.java index 60e7b5f8789..3f1cb5d5785 100644 --- a/src/test/org/apache/solr/analysis/TestSynonymFilter.java +++ b/src/test/org/apache/solr/analysis/TestSynonymFilter.java @@ -48,7 +48,7 @@ public class TestSynonymFilter extends BaseTokenTestCase { } }; - SynonymFilter sf = new SynonymFilter(ts, dict, true); + SynonymFilter sf = new SynonymFilter(ts, dict); while(true) { Token t = sf.next(); diff --git a/src/test/org/apache/solr/util/TestCharArrayMap.java b/src/test/org/apache/solr/util/TestCharArrayMap.java new file mode 100755 index 00000000000..d6d767e1a1d --- /dev/null +++ b/src/test/org/apache/solr/util/TestCharArrayMap.java @@ -0,0 +1,208 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.util; + +import junit.framework.TestCase; + +import java.util.*; + +import org.apache.lucene.analysis.StopAnalyzer; + +public class TestCharArrayMap extends TestCase { + Random r = new Random(0); + + public void doRandom(int iter, boolean ignoreCase) { + CharArrayMap map = new CharArrayMap(1,ignoreCase); + HashMap hmap = new HashMap(); + + char[] key; + for (int i=0; i cm = new CharArrayMap(2,false); + HashMap hm = new HashMap(); + hm.put("foo",1); + hm.put("bar",2); + cm.putAll(hm); + assertEquals(hm, cm); + assertEquals(cm, hm); + hm.put("baz", 3); + assertFalse(hm.equals(cm)); + assertFalse(cm.equals(hm)); + assertTrue(cm.equals(cm)); + cm.putAll(hm); + assertEquals(hm, cm); + + Iterator> iter1 = cm.entrySet().iterator(); + int n=0; + while (iter1.hasNext()) { + Map.Entry entry = iter1.next(); + String key = entry.getKey(); + Integer val = entry.getValue(); + assertEquals(hm.get(key), val); + entry.setValue(val*100); + assertEquals(val*100, (int)cm.get(key)); + n++; + } + assertEquals(hm.size(), n); + cm.clear(); + cm.putAll(hm); + + CharArrayMap.EntryIterator iter2 = cm.iterator(); + n=0; + while (iter2.hasNext()) { + char[] keyc = iter2.nextKey(); + Integer val = iter2.currentValue(); + assertEquals(hm.get(new String(keyc)), val); + iter2.setValue(val*100); + assertEquals(val*100, (int)cm.get(keyc)); + n++; + } + assertEquals(hm.size(), n); + + cm.clear(); + assertEquals(0, cm.size()); + assertTrue(cm.isEmpty()); + } + + + // performance test vs HashMap + // HashMap will have an edge because we are testing with + // non-dynamically created keys and String caches hashCode + public static void main(String[] args) { + int a=0; + String impl = args[a++].intern(); // hash OR chars OR char + int iter1 = Integer.parseInt(args[a++]); // iterations of put() + int iter2 = Integer.parseInt(args[a++]); // iterations of get() + + int ret=0; + long start = System.currentTimeMillis(); + String[] stopwords = StopAnalyzer.ENGLISH_STOP_WORDS; + // words = "this is a different test to see what is really going on here... I hope it works well but I'm not sure it will".split(" "); + char[][] stopwordschars = new char[stopwords.length][]; + for (int i=0; i hm=null; + CharArrayMap cm=null; + + if (impl=="hash") { + for (int i=0; i(); + int v=0; + for (String word : stopwords) { + hm.put(word, ++v); + } + ret += hm.size(); + } + } else if (impl=="chars") { + for (int i=0; i(2,false); + int v=0; + for (String s : stopwords) { + cm.put(s,++v); + } + ret += cm.size(); + } + } else if (impl=="char") { + for (int i=0; i(2,false); + int v=0; + for (char[] s : stopwordschars) { + cm.put(s,++v); + } + ret += cm.size(); + } + } + + + if (impl=="hash") { + for (int i=0; i