diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java index 04b7368637c..0d56ce960fd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java @@ -18,13 +18,13 @@ package org.apache.lucene.analysis.charfilter; import java.io.IOException; import java.io.Reader; -import java.util.Map; import org.apache.lucene.analysis.CharFilter; // javadocs import org.apache.lucene.analysis.util.RollingCharBuffer; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.fst.CharSequenceOutputs; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.hppc.CharObjectHashMap; /** * Simplistic {@link CharFilter} that applies the mappings contained in a {@link NormalizeCharMap} @@ -38,7 +38,7 @@ public class MappingCharFilter extends BaseCharFilter { private final FST.BytesReader fstReader; private final RollingCharBuffer buffer = new RollingCharBuffer(); private final FST.Arc scratchArc = new FST.Arc<>(); - private final Map> cachedRootArcs; + private final CharObjectHashMap> cachedRootArcs; private CharsRef replacement; private int replacementPointer; @@ -96,7 +96,7 @@ public class MappingCharFilter extends BaseCharFilter { final int firstCH = buffer.get(inputOff); if (firstCH != -1) { - FST.Arc arc = cachedRootArcs.get(Character.valueOf((char) firstCH)); + FST.Arc arc = cachedRootArcs.get((char) firstCH); if (arc != null) { if (!FST.targetHasArcs(arc)) { // Fast pass for single character match: diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java index 1ffa071835e..ef0b0141ac8 100644 
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.charfilter; import java.io.IOException; -import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import org.apache.lucene.util.CharsRef; @@ -27,6 +26,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.hppc.CharObjectHashMap; // TODO: save/load? @@ -37,7 +37,7 @@ import org.apache.lucene.util.fst.Util; public class NormalizeCharMap { final FST map; - final Map> cachedRootArcs = new HashMap<>(); + final CharObjectHashMap> cachedRootArcs = new CharObjectHashMap<>(); // Use the builder to create: private NormalizeCharMap(FST map) { @@ -53,8 +53,7 @@ public class NormalizeCharMap { while (true) { assert scratchArc.label() != FST.END_LABEL; cachedRootArcs.put( - Character.valueOf((char) scratchArc.label()), - new FST.Arc().copyFrom(scratchArc)); + (char) scratchArc.label(), new FST.Arc().copyFrom(scratchArc)); if (scratchArc.isLast()) { break; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index f93d59e9bbd..d8aac0c5066 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -41,7 +41,6 @@ import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; @@ -49,7 +48,6 @@ import java.util.Map; import java.util.Set; import 
java.util.TreeMap; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator; import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier; import org.apache.lucene.store.Directory; @@ -60,6 +58,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.hppc.CharHashSet; import org.apache.lucene.util.hppc.IntArrayList; import org.apache.lucene.util.hppc.IntCursor; @@ -334,8 +333,8 @@ public class Dictionary { throws IOException, ParseException { TreeMap prefixes = new TreeMap<>(); TreeMap suffixes = new TreeMap<>(); - Set prefixContFlags = new HashSet<>(); - Set suffixContFlags = new HashSet<>(); + CharHashSet prefixContFlags = new CharHashSet(); + CharHashSet suffixContFlags = new CharHashSet(); Map seenPatterns = new HashMap<>(); // zero condition -> 0 ord @@ -673,7 +672,7 @@ public class Dictionary { */ private void parseAffix( TreeMap affixes, - Set secondStageFlags, + CharHashSet secondStageFlags, String header, LineNumberReader reader, AffixKind kind, @@ -1178,10 +1177,14 @@ public class Dictionary { } char[] allNonSuggestibleFlags() { - return Dictionary.toSortedCharArray( - Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard) - .filter(c -> c != FLAG_UNSET) - .collect(Collectors.toSet())); + CharHashSet set = new CharHashSet(5); + set.add(HIDDEN_FLAG); + for (char c : new char[] {noSuggest, forbiddenword, onlyincompound, subStandard}) { + if (c != FLAG_UNSET) { + set.add(c); + } + } + return Dictionary.toSortedCharArray(set); } private List readMorphFields(String word, String unparsed) { @@ -1538,12 +1541,8 @@ public class Dictionary { return reuse; } - static char[] toSortedCharArray(Set set) { - char[] chars = new char[set.size()]; - int i = 0; - for (Character c : set) { - 
chars[i++] = c; - } + static char[] toSortedCharArray(CharHashSet set) { + char[] chars = set.toArray(); Arrays.sort(chars); return chars; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java index 2e0799f4caa..c2e9d05ec0d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordFormGenerator.java @@ -42,6 +42,8 @@ import org.apache.lucene.analysis.hunspell.AffixedWord.Affix; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.IntsRefFSTEnum; +import org.apache.lucene.util.hppc.CharHashSet; +import org.apache.lucene.util.hppc.CharObjectHashMap; /** * A utility class used for generating possible word forms by adding affixes to stems ({@link @@ -50,7 +52,7 @@ import org.apache.lucene.util.fst.IntsRefFSTEnum; */ public class WordFormGenerator { private final Dictionary dictionary; - private final Map> affixes = new HashMap<>(); + private final CharObjectHashMap> affixes = new CharObjectHashMap<>(); private final Stemmer stemmer; public WordFormGenerator(Dictionary dictionary) { @@ -75,7 +77,15 @@ public class WordFormGenerator { char flag = dictionary.affixData(id, AFFIX_FLAG); var entry = new AffixEntry(id, flag, kind, toString(kind, io.input), strip(id), condition(id)); - affixes.computeIfAbsent(flag, __ -> new ArrayList<>()).add(entry); + List entries; + int index = affixes.indexOf(flag); + if (index < 0) { + entries = new ArrayList<>(); + affixes.indexInsert(index, flag, entries); + } else { + entries = affixes.indexGet(index); + } + entries.add(entry); } } } catch (IOException e) { @@ -162,11 +172,7 @@ public class WordFormGenerator { } private static char[] deduplicate(char[] flags) { - Set set = new HashSet<>(); - for (char flag : 
flags) { - set.add(flag); - } - return toSortedCharArray(set); + return toSortedCharArray(CharHashSet.from(flags)); } /** @@ -408,7 +414,7 @@ public class WordFormGenerator { int innerSuffix) { String candidate = new String(word, offset, length); stemCounts.merge(candidate, 1, Integer::sum); - Set flags = new LinkedHashSet<>(); + CharHashSet flags = new CharHashSet(); if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG)); if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG)); if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG)); @@ -479,7 +485,7 @@ public class WordFormGenerator { if (wordSet.contains(extra)) continue; if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) { - addEntry(toEdit, toAdd, extra, Set.of(dictionary.forbiddenword)); + addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword)); } else { extraGenerated.add(extra); } @@ -489,7 +495,7 @@ public class WordFormGenerator { } private void addEntry( - List toEdit, List toAdd, String stem, Set flags) { + List toEdit, List toAdd, String stem, CharHashSet flags) { String flagString = toFlagString(flags); (existingStems.contains(stem) ? 
toEdit : toAdd).add(DictEntry.create(stem, flagString)); } @@ -529,18 +535,20 @@ public class WordFormGenerator { .flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream()); } - private List expand(String stem, Set flagSet) { + private List expand(String stem, CharHashSet flagSet) { return getAllWordForms(stem, toFlagString(flagSet), checkCanceled); } - private String toFlagString(Set flagSet) { + private String toFlagString(CharHashSet flagSet) { return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flagSet)); } } - private record FlagSet(Set flags, Dictionary dictionary) { - static Set flatten(Set flagSets) { - return flagSets.stream().flatMap(f -> f.flags.stream()).collect(Collectors.toSet()); + private record FlagSet(CharHashSet flags, Dictionary dictionary) { + static CharHashSet flatten(Set flagSets) { + CharHashSet set = new CharHashSet(flagSets.size() << 1); + flagSets.forEach(flagSet -> set.addAll(flagSet.flags)); + return set; } @Override diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseFilterUtil.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseFilterUtil.java new file mode 100644 index 00000000000..36410b3ea31 --- /dev/null +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseFilterUtil.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ja; + +import java.util.Map; +import org.apache.lucene.util.hppc.CharObjectHashMap; + +/** Utility methods for Japanese filters. */ +class JapaneseFilterUtil { + + /** Creates a primitive char-to-char map from a set of {@link java.util.Map.Entry}. */ + @SafeVarargs + static CharObjectHashMap createCharMap( + Map.Entry... charMappings) { + CharObjectHashMap map = new CharObjectHashMap<>(charMappings.length); + for (Map.Entry charMapping : charMappings) { + map.put(charMapping.getKey(), charMapping.getValue()); + } + return map; + } +} diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java index b5078b73607..e7c0969aed5 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseHiraganaUppercaseFilter.java @@ -16,11 +16,14 @@ */ package org.apache.lucene.analysis.ja; +import static org.apache.lucene.analysis.ja.JapaneseFilterUtil.createCharMap; + import java.io.IOException; import java.util.Map; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.hppc.CharObjectHashMap; /** * A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. 
For @@ -30,13 +33,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; * legal, contract policies, etc. */ public final class JapaneseHiraganaUppercaseFilter extends TokenFilter { - private static final Map LETTER_MAPPINGS; + private static final CharObjectHashMap LETTER_MAPPINGS; static { // supported characters are: // ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ LETTER_MAPPINGS = - Map.ofEntries( + createCharMap( Map.entry('ぁ', 'あ'), Map.entry('ぃ', 'い'), Map.entry('ぅ', 'う'), @@ -59,17 +62,16 @@ public final class JapaneseHiraganaUppercaseFilter extends TokenFilter { @Override public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - char[] termBuffer = termAttr.buffer(); - for (int i = 0; i < termBuffer.length; i++) { - Character c = LETTER_MAPPINGS.get(termBuffer[i]); - if (c != null) { - termBuffer[i] = c; - } - } - return true; - } else { + if (!input.incrementToken()) { return false; } + final char[] termBuffer = termAttr.buffer(); + for (int i = 0, length = termAttr.length(); i < length; i++) { + Character c = LETTER_MAPPINGS.get(termBuffer[i]); + if (c != null) { + termBuffer[i] = c; + } + } + return true; } } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java index 5e05714d1c3..7a96acd4c0c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaUppercaseFilter.java @@ -16,11 +16,14 @@ */ package org.apache.lucene.analysis.ja; +import static org.apache.lucene.analysis.ja.JapaneseFilterUtil.createCharMap; + import java.io.IOException; import java.util.Map; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import 
org.apache.lucene.util.hppc.CharObjectHashMap; /** * A {@link TokenFilter} that normalizes small letters (捨て仮名) in katakana into normal letters. For @@ -30,13 +33,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; * legal, contract policies, etc. */ public final class JapaneseKatakanaUppercaseFilter extends TokenFilter { - private static final Map LETTER_MAPPINGS; + private static final CharObjectHashMap LETTER_MAPPINGS; static { // supported characters are: // ァ ィ ゥ ェ ォ ヵ ㇰ ヶ ㇱ ㇲ ッ ㇳ ㇴ ㇵ ㇶ ㇷ ㇷ゚ ㇸ ㇹ ㇺ ャ ュ ョ ㇻ ㇼ ㇽ ㇾ ㇿ ヮ LETTER_MAPPINGS = - Map.ofEntries( + createCharMap( Map.entry('ァ', 'ア'), Map.entry('ィ', 'イ'), Map.entry('ゥ', 'ウ'), @@ -75,22 +78,24 @@ public final class JapaneseKatakanaUppercaseFilter extends TokenFilter { @Override public boolean incrementToken() throws IOException { - if (input.incrementToken()) { - String term = termAttr.toString(); - if (term.contains("ㇷ゚")) { - term = term.replace("ㇷ゚", "プ"); - termAttr.setEmpty().append(term); - } - char[] termBuffer = termAttr.buffer(); - for (int i = 0; i < termBuffer.length; i++) { - Character c = LETTER_MAPPINGS.get(termBuffer[i]); - if (c != null) { - termBuffer[i] = c; - } - } - return true; - } else { + if (!input.incrementToken()) { return false; } + final char[] termBuffer = termAttr.buffer(); + int newLength = termAttr.length(); + for (int from = 0, to = 0, length = newLength; from < length; from++, to++) { + char c = termBuffer[from]; + if (c == 'ㇷ' && from + 1 < length && termBuffer[from + 1] == '゚') { + // ㇷ゚detected, replace it by プ. + termBuffer[to] = 'プ'; + from++; + newLength--; + } else { + Character mappedChar = LETTER_MAPPINGS.get(c); + termBuffer[to] = mappedChar == null ? 
c : mappedChar; + } + } + termAttr.setLength(newLength); + return true; } } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java index 30039305797..fffe62f0ebf 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaUppercaseFilter.java @@ -65,6 +65,7 @@ public class TestJapaneseKatakanaUppercaseFilter extends BaseTokenStreamTestCase new String[] {"アイウエオカクケシスツトヌハヒフプヘホムヤユヨラリルレロワ"}); assertAnalyzesTo(keywordAnalyzer, "ストップウォッチ", new String[] {"ストツプウオツチ"}); assertAnalyzesTo(keywordAnalyzer, "サラニㇷ゚ カムイチェㇷ゚ ㇷ゚ㇷ゚", new String[] {"サラニプ", "カムイチエプ", "ププ"}); + assertAnalyzesTo(keywordAnalyzer, "カムイチェㇷ゚カムイチェ", new String[] {"カムイチエプカムイチエ"}); } public void testKanaUppercaseWithSurrogatePair() throws IOException { diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Gener.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Gener.java index 8b2144d56f1..b66f6f79656 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Gener.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Gener.java @@ -58,6 +58,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; +import org.apache.lucene.util.hppc.ObjectCursor; /** * The Gener object helps in the discarding of nodes which break the reduction effort and defend the @@ -103,8 +104,8 @@ public class Gener extends Reduce { */ public boolean eat(Row in, int[] remap) { int sum = 0; - for (Iterator i = in.cells.values().iterator(); i.hasNext(); ) { - Cell c = i.next(); + for (Iterator> i = in.cells.values().iterator(); i.hasNext(); ) { + Cell c = i.next().value; sum += c.cnt; if (c.ref >= 0) { if (remap[c.ref] == 0) { @@ -114,8 
+115,8 @@ public class Gener extends Reduce { } int frame = sum / 10; boolean live = false; - for (Iterator i = in.cells.values().iterator(); i.hasNext(); ) { - Cell c = i.next(); + for (Iterator> i = in.cells.values().iterator(); i.hasNext(); ) { + Cell c = i.next().value; if (c.cnt < frame && c.cmd >= 0) { c.cnt = 0; c.cmd = -1; diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Lift.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Lift.java index 3e7ee057734..62b2cea821e 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Lift.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Lift.java @@ -58,6 +58,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; +import org.apache.lucene.util.hppc.ObjectCursor; /** * The Lift class is a data structure that is a variation of a Patricia trie. @@ -111,9 +112,9 @@ public class Lift extends Reduce { * @param nodes contains the patch commands */ public void liftUp(Row in, List nodes) { - Iterator i = in.cells.values().iterator(); + Iterator> i = in.cells.values().iterator(); for (; i.hasNext(); ) { - Cell c = i.next(); + Cell c = i.next().value; if (c.ref >= 0) { Row to = nodes.get(c.ref); int sum = to.uniformCmd(changeSkip); diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Optimizer.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Optimizer.java index f22d658039d..778e863b01c 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Optimizer.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Optimizer.java @@ -58,6 +58,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; +import org.apache.lucene.util.hppc.CharCursor; /** * The Optimizer class is a Trie that will be reduced (have empty rows removed). 
@@ -116,10 +117,10 @@ public class Optimizer extends Reduce { * @return the resulting Row, or null if the operation cannot be realized */ public Row merge(Row master, Row existing) { - Iterator i = master.cells.keySet().iterator(); + Iterator i = master.cells.keys().iterator(); Row n = new Row(); for (; i.hasNext(); ) { - Character ch = i.next(); + char ch = i.next().value; // XXX also must handle Cnt and Skip !! Cell a = master.cells.get(ch); Cell b = existing.cells.get(ch); @@ -130,9 +131,9 @@ public class Optimizer extends Reduce { } n.cells.put(ch, s); } - i = existing.cells.keySet().iterator(); + i = existing.cells.keys().iterator(); for (; i.hasNext(); ) { - Character ch = i.next(); + char ch = i.next().value; if (master.at(ch) != null) { continue; } diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Reduce.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Reduce.java index da6a3c09f7a..ae89e1b81ab 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Reduce.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Reduce.java @@ -58,6 +58,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; +import org.apache.lucene.util.hppc.CharCursor; +import org.apache.lucene.util.hppc.ObjectCursor; /** The Reduce object is used to remove gaps in a Trie which stores a dictionary. 
*/ public class Reduce { @@ -88,9 +90,9 @@ public class Reduce { Row now = old.get(ind); to.add(now); - Iterator i = now.cells.values().iterator(); + Iterator> i = now.cells.values().iterator(); for (; i.hasNext(); ) { - Cell c = i.next(); + Cell c = i.next().value; if (c.ref >= 0 && remap[c.ref] < 0) { removeGaps(c.ref, old, to, remap); } @@ -109,9 +111,9 @@ public class Reduce { */ public Remap(Row old, int[] remap) { super(); - Iterator i = old.cells.keySet().iterator(); + Iterator i = old.cells.keys().iterator(); for (; i.hasNext(); ) { - Character ch = i.next(); + char ch = i.next().value; Cell c = old.at(ch); Cell nc; if (c.ref >= 0) { diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java index b3615fb6ad3..3c8db8fdbb8 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java @@ -59,11 +59,13 @@ import java.io.DataOutput; import java.io.IOException; import java.io.PrintStream; import java.util.Iterator; -import java.util.TreeMap; +import org.apache.lucene.util.hppc.CharCursor; +import org.apache.lucene.util.hppc.CharObjectHashMap; +import org.apache.lucene.util.hppc.ObjectCursor; /** The Row class represents a row in a matrix representation of a trie. */ public class Row { - TreeMap cells = new TreeMap<>(); + CharObjectHashMap cells = new CharObjectHashMap<>(); int uniformCnt = 0; int uniformSkip = 0; @@ -98,12 +100,12 @@ public class Row { } /** - * Set the command in the Cell of the given Character to the given integer. + * Set the command in the Cell of the given character to the given integer. 
* - * @param way the Character defining the Cell + * @param way the character defining the Cell * @param cmd the new command */ - public void setCmd(Character way, int cmd) { + public void setCmd(char way, int cmd) { Cell c = at(way); if (c == null) { c = new Cell(); @@ -116,12 +118,12 @@ public class Row { } /** - * Set the reference to the next row in the Cell of the given Character to the given integer. + * Set the reference to the next row in the Cell of the given character to the given integer. * - * @param way the Character defining the Cell + * @param way the character defining the Cell * @param ref The new ref value */ - public void setRef(Character way, int ref) { + public void setRef(char way, int ref) { Cell c = at(way); if (c == null) { c = new Cell(); @@ -138,10 +140,10 @@ public class Row { * @return the number of cells in use */ public int getCells() { - Iterator i = cells.keySet().iterator(); + Iterator i = cells.keys().iterator(); int size = 0; for (; i.hasNext(); ) { - Character c = i.next(); + char c = i.next().value; Cell e = at(c); if (e.cmd >= 0 || e.ref >= 0) { size++; @@ -156,10 +158,10 @@ public class Row { * @return the number of references */ public int getCellsPnt() { - Iterator i = cells.keySet().iterator(); + Iterator i = cells.keys().iterator(); int size = 0; for (; i.hasNext(); ) { - Character c = i.next(); + char c = i.next().value; Cell e = at(c); if (e.ref >= 0) { size++; @@ -174,10 +176,10 @@ public class Row { * @return the number of patch commands */ public int getCellsVal() { - Iterator i = cells.keySet().iterator(); + Iterator i = cells.keys().iterator(); int size = 0; for (; i.hasNext(); ) { - Character c = i.next(); + char c = i.next().value; Cell e = at(c); if (e.cmd >= 0) { size++; @@ -187,35 +189,35 @@ public class Row { } /** - * Return the command in the Cell associated with the given Character. + * Return the command in the Cell associated with the given character. 
* - * @param way the Character associated with the Cell holding the desired command + * @param way the character associated with the Cell holding the desired command * @return the command */ - public int getCmd(Character way) { + public int getCmd(char way) { Cell c = at(way); return (c == null) ? -1 : c.cmd; } /** - * Return the number of patch commands were in the Cell associated with the given Character before + * Return the number of patch commands were in the Cell associated with the given character before * the Trie containing this Row was reduced. * - * @param way the Character associated with the desired Cell + * @param way the character associated with the desired Cell * @return the number of patch commands before reduction */ - public int getCnt(Character way) { + public int getCnt(char way) { Cell c = at(way); return (c == null) ? -1 : c.cnt; } /** - * Return the reference to the next Row in the Cell associated with the given Character. + * Return the reference to the next Row in the Cell associated with the given character. * - * @param way the Character associated with the desired Cell + * @param way the character associated with the desired Cell * @return the reference, or -1 if the Cell is null */ - public int getRef(Character way) { + public int getRef(char way) { Cell c = at(way); return (c == null) ? 
-1 : c.ref; } @@ -228,15 +230,15 @@ public class Row { */ public void store(DataOutput os) throws IOException { os.writeInt(cells.size()); - Iterator i = cells.keySet().iterator(); + Iterator i = cells.keys().iterator(); for (; i.hasNext(); ) { - Character c = i.next(); + char c = i.next().value; Cell e = at(c); if (e.cmd < 0 && e.ref < 0) { continue; } - os.writeChar(c.charValue()); + os.writeChar(c); os.writeInt(e.cmd); os.writeInt(e.cnt); os.writeInt(e.ref); @@ -251,12 +253,12 @@ public class Row { * @return the number of identical Cells, or -1 if there are (at least) two different cells */ public int uniformCmd(boolean eqSkip) { - Iterator i = cells.values().iterator(); + Iterator> i = cells.values().iterator(); int ret = -1; uniformCnt = 1; uniformSkip = 0; for (; i.hasNext(); ) { - Cell c = i.next(); + Cell c = i.next().value; if (c.ref >= 0) { return -1; } @@ -284,15 +286,15 @@ public class Row { /** Write the contents of this Row to the printstream. */ public void print(PrintStream out) { - for (Iterator i = cells.keySet().iterator(); i.hasNext(); ) { - Character ch = i.next(); + for (Iterator i = cells.keys().iterator(); i.hasNext(); ) { + char ch = i.next().value; Cell c = at(ch); out.print("[" + ch + ":" + c + "]"); } out.println(); } - Cell at(Character index) { + Cell at(char index) { return cells.get(index); } } diff --git a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java index aef5d189747..b350c64a750 100644 --- a/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java +++ b/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java @@ -134,7 +134,7 @@ public class Trie { boolean br = false; for (int i = 0; i < key.length() - 1; i++) { - Character ch = e.next(); + char ch = e.next(); w = now.getCmd(ch); if (w >= 0) { int n = w; @@ -227,7 +227,7 @@ public class Trie { Cell c; int cmd = -1; StrEnum e = new StrEnum(key, forward); - Character ch = null; + char ch; 
for (int i = 0; i < key.length(); ) { ch = e.next(); @@ -272,7 +272,7 @@ public class Trie { StrEnum e = new StrEnum(key, forward); for (int i = 0; i < key.length() - 1; i++) { - Character ch = e.next(); + char ch = e.next(); w = now.getCmd(ch); if (w >= 0) { last = cmds.get(w); @@ -343,7 +343,7 @@ public class Trie { StrEnum e = new StrEnum(key, forward); for (int i = 0; i < e.length() - 1; i++) { - Character ch = e.next(); + char ch = e.next(); node = r.getRef(ch); if (node >= 0) { r = getRow(node); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 190046e2a3f..1fc1fb3a7fa 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -281,8 +281,8 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer { private static List sortFieldNames( IntObjectHashMap fieldMap, FieldInfos fieldInfos) { List fieldNames = new ArrayList<>(fieldMap.size()); - for (IntCursor fieldNumberCursor : fieldMap.keys()) { - fieldNames.add(fieldInfos.fieldInfo(fieldNumberCursor.value).name); + for (IntCursor fieldNumber : fieldMap.keys()) { + fieldNames.add(fieldInfos.fieldInfo(fieldNumber.value).name); } fieldNames.sort(null); return Collections.unmodifiableList(fieldNames); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java index 5bcf133433f..f21ed0a1d1e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StateSet.java @@ -95,8 +95,8 @@ final class StateSet extends IntSet { } arrayCache = new int[inner.size()]; int i = 0; - for (IntCursor cursor : inner.keys()) { 
/** Forked from HPPC, holding int index and char value. */
public final class CharCursor {
  /**
   * The current value's index in the container this cursor belongs to. The meaning of this index
   * is defined by the container (usually it will be an index in the underlying storage buffer).
   */
  public int index;

  /** The current value. */
  public char value;

  @Override
  public String toString() {
    return "[cursor, index: " + index + ", value: " + value + "]";
  }
}
+ */ + +package org.apache.lucene.util.hppc; + +import static org.apache.lucene.util.hppc.HashContainers.DEFAULT_EXPECTED_ELEMENTS; +import static org.apache.lucene.util.hppc.HashContainers.DEFAULT_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.ITERATION_SEED; +import static org.apache.lucene.util.hppc.HashContainers.MAX_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.MIN_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.checkLoadFactor; +import static org.apache.lucene.util.hppc.HashContainers.expandAtCount; +import static org.apache.lucene.util.hppc.HashContainers.iterationIncrement; +import static org.apache.lucene.util.hppc.HashContainers.minBufferSize; +import static org.apache.lucene.util.hppc.HashContainers.nextBufferSize; + +import java.util.Arrays; +import java.util.Iterator; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * A hash set of chars, implemented using open addressing with linear probing for + * collision resolution. + * + *

Mostly forked and trimmed from com.carrotsearch.hppc.CharHashSet + * + *

github: https://github.com/carrotsearch/hppc release 0.9.0 + */ +public class CharHashSet implements Iterable, Accountable, Cloneable { + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(CharHashSet.class); + + private static final char EMPTY_KEY = (char) 0; + + /** The hash array holding keys. */ + public char[] keys; + + /** + * The number of stored keys (assigned key slots), excluding the special "empty" key, if any. + * + * @see #size() + * @see #hasEmptyKey + */ + protected int assigned; + + /** Mask for slot scans in {@link #keys}. */ + protected int mask; + + /** Expand (rehash) {@link #keys} when {@link #assigned} hits this value. */ + protected int resizeAt; + + /** Special treatment for the "empty slot" key marker. */ + protected boolean hasEmptyKey; + + /** The load factor for {@link #keys}. */ + protected double loadFactor; + + /** Seed used to ensure the hash iteration order is different from an iteration to another. */ + protected int iterationSeed; + + /** New instance with sane defaults. */ + public CharHashSet() { + this(DEFAULT_EXPECTED_ELEMENTS); + } + + /** + * New instance with sane defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause a rehash + * (inclusive). + */ + public CharHashSet(int expectedElements) { + this(expectedElements, DEFAULT_LOAD_FACTOR); + } + + /** + * New instance with the provided defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause a rehash + * (inclusive). + * @param loadFactor The load factor for internal buffers. Insane load factors (zero, full + * capacity) are rejected by {@link #verifyLoadFactor(double)}. + */ + public CharHashSet(int expectedElements, double loadFactor) { + this.loadFactor = verifyLoadFactor(loadFactor); + iterationSeed = ITERATION_SEED.incrementAndGet(); + ensureCapacity(expectedElements); + } + + /** New instance copying elements from another set. 
*/ + public CharHashSet(CharHashSet set) { + this(set.size()); + addAll(set); + } + + public boolean add(char key) { + if (((key) == 0)) { + assert ((keys[mask + 1]) == 0); + boolean added = !hasEmptyKey; + hasEmptyKey = true; + return added; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return false; + } + slot = (slot + 1) & mask; + } + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(slot, key); + } else { + keys[slot] = key; + } + + assigned++; + return true; + } + } + + /** + * Adds all elements from the given list (vararg) to this set. + * + * @return Returns the number of elements actually added as a result of this call (not previously + * present in the set). + */ + public final int addAll(char... elements) { + ensureCapacity(elements.length); + int count = 0; + for (char e : elements) { + if (add(e)) { + count++; + } + } + return count; + } + + /** + * Adds all elements from the given set to this set. + * + * @return Returns the number of elements actually added as a result of this call (not previously + * present in the set). + */ + public int addAll(CharHashSet set) { + ensureCapacity(set.size()); + return addAll((Iterable) set); + } + + /** + * Adds all elements from the given iterable to this set. + * + * @return Returns the number of elements actually added as a result of this call (not previously + * present in the set). 
+ */ + public int addAll(Iterable iterable) { + int count = 0; + for (CharCursor cursor : iterable) { + if (add(cursor.value)) { + count++; + } + } + return count; + } + + public char[] toArray() { + + final char[] cloned = (new char[size()]); + int j = 0; + if (hasEmptyKey) { + cloned[j++] = EMPTY_KEY; + } + + final char[] keys = this.keys; + int seed = nextIterationSeed(); + int inc = iterationIncrement(seed); + for (int i = 0, mask = this.mask, slot = seed & mask; + i <= mask; + i++, slot = (slot + inc) & mask) { + char existing; + if (!((existing = keys[slot]) == 0)) { + cloned[j++] = existing; + } + } + + return cloned; + } + + /** An alias for the (preferred) {@link #removeAll}. */ + public boolean remove(char key) { + if (((key) == 0)) { + boolean hadEmptyKey = hasEmptyKey; + hasEmptyKey = false; + return hadEmptyKey; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + shiftConflictingKeys(slot); + return true; + } + slot = (slot + 1) & mask; + } + return false; + } + } + + /** + * Removes all keys present in a given container. + * + * @return Returns the number of elements actually removed as a result of this call. + */ + public int removeAll(CharHashSet other) { + final int before = size(); + + // Try to iterate over the smaller set or over the container that isn't implementing + // efficient contains() lookup. + + if (other.size() >= size()) { + if (hasEmptyKey && other.contains(EMPTY_KEY)) { + hasEmptyKey = false; + } + + final char[] keys = this.keys; + for (int slot = 0, max = this.mask; slot <= max; ) { + char existing; + if (!((existing = keys[slot]) == 0) && other.contains(existing)) { + // Shift, do not increment slot. 
+ shiftConflictingKeys(slot); + } else { + slot++; + } + } + } else { + for (CharCursor c : other) { + remove(c.value); + } + } + + return before - size(); + } + + public boolean contains(char key) { + if (((key) == 0)) { + return hasEmptyKey; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return true; + } + slot = (slot + 1) & mask; + } + return false; + } + } + + public void clear() { + assigned = 0; + hasEmptyKey = false; + Arrays.fill(keys, EMPTY_KEY); + } + + public void release() { + assigned = 0; + hasEmptyKey = false; + keys = null; + ensureCapacity(DEFAULT_EXPECTED_ELEMENTS); + } + + public boolean isEmpty() { + return size() == 0; + } + + /** + * Ensure this container can hold at least the given number of elements without resizing its + * buffers. + * + * @param expectedElements The total number of elements, inclusive. + */ + public void ensureCapacity(int expectedElements) { + if (expectedElements > resizeAt || keys == null) { + final char[] prevKeys = this.keys; + allocateBuffers(minBufferSize(expectedElements, loadFactor)); + if (prevKeys != null && !isEmpty()) { + rehash(prevKeys); + } + } + } + + public int size() { + return assigned + (hasEmptyKey ? 1 : 0); + } + + @Override + public int hashCode() { + int h = hasEmptyKey ? 0xDEADBEEF : 0; + final char[] keys = this.keys; + for (int slot = mask; slot >= 0; slot--) { + char existing; + if (!((existing = keys[slot]) == 0)) { + h += BitMixer.mix(existing); + } + } + return h; + } + + @Override + public boolean equals(Object obj) { + return (this == obj) + || (obj != null && getClass() == obj.getClass() && sameKeys(getClass().cast(obj))); + } + + /** Return true if all keys of some other container exist in this container. 
*/ + private boolean sameKeys(CharHashSet other) { + if (other.size() != size()) { + return false; + } + + for (CharCursor c : other) { + if (!contains(c.value)) { + return false; + } + } + + return true; + } + + @Override + public CharHashSet clone() { + try { + /* */ + CharHashSet cloned = (CharHashSet) super.clone(); + cloned.keys = keys.clone(); + cloned.hasEmptyKey = hasEmptyKey; + cloned.iterationSeed = ITERATION_SEED.incrementAndGet(); + return cloned; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + @Override + public Iterator iterator() { + return new EntryIterator(); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(keys); + } + + /** + * Provides the next iteration seed used to build the iteration starting slot and offset + * increment. This method does not need to be synchronized, what matters is that each thread gets + * a sequence of varying seeds. + */ + protected int nextIterationSeed() { + return iterationSeed = BitMixer.mixPhi(iterationSeed); + } + + /** An iterator implementation for {@link #iterator}. */ + protected final class EntryIterator extends AbstractIterator { + private final CharCursor cursor; + private final int increment; + private int index; + private int slot; + + public EntryIterator() { + cursor = new CharCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected CharCursor fetch() { + final int mask = CharHashSet.this.mask; + while (index <= mask) { + char existing; + index++; + slot = (slot + increment) & mask; + if (!((existing = keys[slot]) == 0)) { + cursor.index = slot; + cursor.value = existing; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index++; + cursor.value = EMPTY_KEY; + return cursor; + } + + return done(); + } + } + + /** + * Create a set from a variable number of arguments or an array of char. 
The elements + * are copied from the argument to the internal buffer. + */ + /* */ + public static CharHashSet from(char... elements) { + final CharHashSet set = new CharHashSet(elements.length); + set.addAll(elements); + return set; + } + + /** + * Returns a hash code for the given key. + * + *

The output from this function should evenly distribute keys across the entire integer range. + */ + protected int hashKey(char key) { + assert !((key) == 0); // Handled as a special case (empty slot marker). + return BitMixer.mixPhi(key); + } + + /** + * Returns a logical "index" of a given key that can be used to speed up follow-up logic in + * certain scenarios (conditional logic). + * + *

The semantics of "indexes" are not strictly defined. Indexes may (and typically won't be) + * contiguous. + * + *

The index is valid only between modifications (it will not be affected by read-only + * operations). + * + * @see #indexExists + * @see #indexGet + * @see #indexInsert + * @see #indexReplace + * @param key The key to locate in the set. + * @return A non-negative value of the logical "index" of the key in the set or a negative value + * if the key did not exist. + */ + public int indexOf(char key) { + final int mask = this.mask; + if (((key) == 0)) { + return hasEmptyKey ? mask + 1 : ~(mask + 1); + } else { + final char[] keys = this.keys; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((key) == (existing))) { + return slot; + } + slot = (slot + 1) & mask; + } + + return ~slot; + } + } + + /** + * @see #indexOf + * @param index The index of a given key, as returned from {@link #indexOf}. + * @return Returns true if the index corresponds to an existing key or false + * otherwise. This is equivalent to checking whether the index is a positive value (existing + * keys) or a negative value (non-existing keys). + */ + public boolean indexExists(int index) { + assert index < 0 || index <= mask || (index == mask + 1 && hasEmptyKey); + + return index >= 0; + } + + /** + * Returns the exact value of the existing key. This method makes sense for sets of objects which + * define custom key-equality relationship. + * + * @see #indexOf + * @param index The index of an existing key. + * @return Returns the equivalent key currently stored in the set. + * @throws AssertionError If assertions are enabled and the index does not correspond to an + * existing key. + */ + public char indexGet(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + return keys[index]; + } + + /** + * Replaces the existing equivalent key with the given one and returns any previous value stored + * for that key. 
+ * + * @see #indexOf + * @param index The index of an existing key. + * @param equivalentKey The key to put in the set as a replacement. Must be equivalent to the key + * currently stored at the provided index. + * @return Returns the previous key stored in the set. + * @throws AssertionError If assertions are enabled and the index does not correspond to an + * existing key. + */ + public char indexReplace(int index, char equivalentKey) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + assert ((keys[index]) == (equivalentKey)); + + char previousValue = keys[index]; + keys[index] = equivalentKey; + return previousValue; + } + + /** + * Inserts a key for an index that is not present in the set. This method may help in avoiding + * double recalculation of the key's hash. + * + * @see #indexOf + * @param index The index of a previously non-existing key, as returned from {@link #indexOf}. + * @throws AssertionError If assertions are enabled and the index does not correspond to an + * existing key. + */ + public void indexInsert(int index, char key) { + assert index < 0 : "The index must not point at an existing key."; + + index = ~index; + if (((key) == 0)) { + assert index == mask + 1; + assert ((keys[index]) == 0); + hasEmptyKey = true; + } else { + assert ((keys[index]) == 0); + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(index, key); + } else { + keys[index] = key; + } + + assigned++; + } + } + + /** + * Removes a key at an index previously acquired from {@link #indexOf}. + * + * @see #indexOf + * @param index The index of the key to remove, as returned from {@link #indexOf}. + * @throws AssertionError If assertions are enabled and the index does not correspond to an + * existing key. 
+ */ + public void indexRemove(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + if (index > mask) { + hasEmptyKey = false; + } else { + shiftConflictingKeys(index); + } + } + + /** + * Validate load factor range and return it. Override and suppress if you need insane load + * factors. + */ + protected double verifyLoadFactor(double loadFactor) { + checkLoadFactor(loadFactor, MIN_LOAD_FACTOR, MAX_LOAD_FACTOR); + return loadFactor; + } + + /** Rehash from old buffers to new buffers. */ + protected void rehash(char[] fromKeys) { + assert HashContainers.checkPowerOfTwo(fromKeys.length - 1); + + // Rehash all stored keys into the new buffers. + final char[] keys = this.keys; + final int mask = this.mask; + char existing; + for (int i = fromKeys.length - 1; --i >= 0; ) { + if (!((existing = fromKeys[i]) == 0)) { + int slot = hashKey(existing) & mask; + while (!((keys[slot]) == 0)) { + slot = (slot + 1) & mask; + } + keys[slot] = existing; + } + } + } + + /** + * Allocate new internal buffers. This method attempts to allocate and assign internal buffers + * atomically (either allocations succeed or not). + */ + protected void allocateBuffers(int arraySize) { + assert Integer.bitCount(arraySize) == 1; + + // Ensure no change is done if we hit an OOM. + char[] prevKeys = this.keys; + try { + int emptyElementSlot = 1; + this.keys = (new char[arraySize + emptyElementSlot]); + } catch (OutOfMemoryError e) { + this.keys = prevKeys; + throw new BufferAllocationException( + "Not enough memory to allocate buffers for rehashing: %,d -> %,d", + e, this.keys == null ? 0 : size(), arraySize); + } + + this.resizeAt = expandAtCount(arraySize, loadFactor); + this.mask = arraySize - 1; + } + + /** + * This method is invoked when there is a new key to be inserted into the buffer but there is not + * enough empty slots to do so. + * + *

New buffers are allocated. If this succeeds, we know we can proceed with rehashing so we + * assign the pending element to the previous buffer (possibly violating the invariant of having + * at least one empty slot) and rehash all keys, substituting new buffers at the end. + */ + protected void allocateThenInsertThenRehash(int slot, char pendingKey) { + assert assigned == resizeAt && ((keys[slot]) == 0) && !((pendingKey) == 0); + + // Try to allocate new buffers first. If we OOM, we leave in a consistent state. + final char[] prevKeys = this.keys; + allocateBuffers(nextBufferSize(mask + 1, size(), loadFactor)); + assert this.keys.length > prevKeys.length; + + // We have succeeded at allocating new data so insert the pending key/value at + // the free slot in the old arrays before rehashing. + prevKeys[slot] = pendingKey; + + // Rehash old keys, including the pending key. + rehash(prevKeys); + } + + /** Shift all the slot-conflicting keys allocated to (and including) slot. */ + protected void shiftConflictingKeys(int gapSlot) { + final char[] keys = this.keys; + final int mask = this.mask; + + // Perform shifts of conflicting keys to fill in the gap. + int distance = 0; + while (true) { + final int slot = (gapSlot + (++distance)) & mask; + final char existing = keys[slot]; + if (((existing) == 0)) { + break; + } + + final int idealSlot = hashKey(existing); + final int shift = (slot - idealSlot) & mask; + if (shift >= distance) { + // Entry at this position was originally at or before the gap slot. + // Move the conflict-shifted entry to the gap's position and repeat the procedure + // for any entries to the right of the current position, treating it + // as the new gap. + keys[gapSlot] = existing; + gapSlot = slot; + distance = 0; + } + } + + // Mark the last found gap slot without a conflict as empty. 
+ keys[gapSlot] = EMPTY_KEY; + assigned--; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hppc/CharObjectHashMap.java b/lucene/core/src/java/org/apache/lucene/util/hppc/CharObjectHashMap.java new file mode 100644 index 00000000000..6db9e4affb1 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/hppc/CharObjectHashMap.java @@ -0,0 +1,827 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.util.hppc; + +import static org.apache.lucene.util.hppc.HashContainers.DEFAULT_EXPECTED_ELEMENTS; +import static org.apache.lucene.util.hppc.HashContainers.DEFAULT_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.ITERATION_SEED; +import static org.apache.lucene.util.hppc.HashContainers.MAX_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.MIN_LOAD_FACTOR; +import static org.apache.lucene.util.hppc.HashContainers.checkLoadFactor; +import static org.apache.lucene.util.hppc.HashContainers.checkPowerOfTwo; +import static org.apache.lucene.util.hppc.HashContainers.expandAtCount; +import static org.apache.lucene.util.hppc.HashContainers.iterationIncrement; +import static org.apache.lucene.util.hppc.HashContainers.minBufferSize; +import static org.apache.lucene.util.hppc.HashContainers.nextBufferSize; + +import java.util.Arrays; +import java.util.Iterator; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * A hash map of char to Object, implemented using open addressing with + * linear probing for collision resolution. Supports null values. + * + *

Mostly forked and trimmed from com.carrotsearch.hppc.CharObjectHashMap + * + *

github: https://github.com/carrotsearch/hppc release 0.9.0 + */ +@SuppressWarnings("unchecked") +public class CharObjectHashMap + implements Iterable>, Accountable, Cloneable { + + private static final long BASE_RAM_BYTES_USED = + RamUsageEstimator.shallowSizeOfInstance(CharObjectHashMap.class); + + private static final char EMPTY_KEY = (char) 0; + + /** The array holding keys. */ + public char[] keys; + + /** The array holding values. */ + public Object[] values; + + /** + * The number of stored keys (assigned key slots), excluding the special "empty" key, if any (use + * {@link #size()} instead). + * + * @see #size() + */ + protected int assigned; + + /** Mask for slot scans in {@link #keys}. */ + protected int mask; + + /** Expand (rehash) {@link #keys} when {@link #assigned} hits this value. */ + protected int resizeAt; + + /** Special treatment for the "empty slot" key marker. */ + protected boolean hasEmptyKey; + + /** The load factor for {@link #keys}. */ + protected double loadFactor; + + /** Seed used to ensure the hash iteration order is different from an iteration to another. */ + protected int iterationSeed; + + /** New instance with sane defaults. */ + public CharObjectHashMap() { + this(DEFAULT_EXPECTED_ELEMENTS); + } + + /** + * New instance with sane defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause buffer + * expansion (inclusive). + */ + public CharObjectHashMap(int expectedElements) { + this(expectedElements, DEFAULT_LOAD_FACTOR); + } + + /** + * New instance with the provided defaults. + * + * @param expectedElements The expected number of elements guaranteed not to cause a rehash + * (inclusive). + * @param loadFactor The load factor for internal buffers. Insane load factors (zero, full + * capacity) are rejected by {@link #verifyLoadFactor(double)}. 
+ */ + public CharObjectHashMap(int expectedElements, double loadFactor) { + this.loadFactor = verifyLoadFactor(loadFactor); + iterationSeed = ITERATION_SEED.incrementAndGet(); + ensureCapacity(expectedElements); + } + + /** Create a hash map from all key-value pairs of another map. */ + public CharObjectHashMap(CharObjectHashMap map) { + this(map.size()); + putAll(map); + } + + public VType put(char key, VType value) { + assert assigned < mask + 1; + + final int mask = this.mask; + if (((key) == 0)) { + VType previousValue = hasEmptyKey ? (VType) values[mask + 1] : null; + hasEmptyKey = true; + values[mask + 1] = value; + return previousValue; + } else { + final char[] keys = this.keys; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + final VType previousValue = (VType) values[slot]; + values[slot] = value; + return previousValue; + } + slot = (slot + 1) & mask; + } + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(slot, key, value); + } else { + keys[slot] = key; + values[slot] = value; + } + + assigned++; + return null; + } + } + + public int putAll(Iterable> iterable) { + final int count = size(); + for (CharObjectCursor c : iterable) { + put(c.key, c.value); + } + return size() - count; + } + + /** + * Trove-inspired API method. An equivalent of the + * following code: + * + *

+   * if (!map.containsKey(key)) map.put(value);
+   * 
+ * + * @param key The key of the value to check. + * @param value The value to put if key does not exist. + * @return true if key did not exist and value was placed + * in the map. + */ + public boolean putIfAbsent(char key, VType value) { + int keyIndex = indexOf(key); + if (!indexExists(keyIndex)) { + indexInsert(keyIndex, key, value); + return true; + } else { + return false; + } + } + + public VType remove(char key) { + final int mask = this.mask; + if (((key) == 0)) { + if (!hasEmptyKey) { + return null; + } + hasEmptyKey = false; + VType previousValue = (VType) values[mask + 1]; + values[mask + 1] = 0; + return previousValue; + } else { + final char[] keys = this.keys; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + final VType previousValue = (VType) values[slot]; + shiftConflictingKeys(slot); + return previousValue; + } + slot = (slot + 1) & mask; + } + + return null; + } + } + + public VType get(char key) { + if (((key) == 0)) { + return hasEmptyKey ? (VType) values[mask + 1] : null; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + return (VType) values[slot]; + } + slot = (slot + 1) & mask; + } + + return null; + } + } + + public VType getOrDefault(char key, VType defaultValue) { + if (((key) == 0)) { + return hasEmptyKey ? 
(VType) values[mask + 1] : defaultValue; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + return (VType) values[slot]; + } + slot = (slot + 1) & mask; + } + + return defaultValue; + } + } + + public boolean containsKey(char key) { + if (((key) == 0)) { + return hasEmptyKey; + } else { + final char[] keys = this.keys; + final int mask = this.mask; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + return true; + } + slot = (slot + 1) & mask; + } + + return false; + } + } + + public int indexOf(char key) { + final int mask = this.mask; + if (((key) == 0)) { + return hasEmptyKey ? mask + 1 : ~(mask + 1); + } else { + final char[] keys = this.keys; + int slot = hashKey(key) & mask; + + char existing; + while (!((existing = keys[slot]) == 0)) { + if (((existing) == (key))) { + return slot; + } + slot = (slot + 1) & mask; + } + + return ~slot; + } + } + + public boolean indexExists(int index) { + assert index < 0 || (index >= 0 && index <= mask) || (index == mask + 1 && hasEmptyKey); + + return index >= 0; + } + + public VType indexGet(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + return (VType) values[index]; + } + + public VType indexReplace(int index, VType newValue) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + VType previousValue = (VType) values[index]; + values[index] = newValue; + return previousValue; + } + + public void indexInsert(int index, char key, VType value) { + assert index < 0 : "The index must not point at an existing key."; + + index = ~index; + if (((key) == 0)) { + assert index == mask + 1; + values[index] = value; + hasEmptyKey = 
true; + } else { + assert ((keys[index]) == 0); + + if (assigned == resizeAt) { + allocateThenInsertThenRehash(index, key, value); + } else { + keys[index] = key; + values[index] = value; + } + + assigned++; + } + } + + public VType indexRemove(int index) { + assert index >= 0 : "The index must point at an existing key."; + assert index <= mask || (index == mask + 1 && hasEmptyKey); + + VType previousValue = (VType) values[index]; + if (index > mask) { + assert index == mask + 1; + hasEmptyKey = false; + values[index] = 0; + } else { + shiftConflictingKeys(index); + } + return previousValue; + } + + public void clear() { + assigned = 0; + hasEmptyKey = false; + + Arrays.fill(keys, EMPTY_KEY); + + /* */ + } + + public void release() { + assigned = 0; + hasEmptyKey = false; + + keys = null; + values = null; + ensureCapacity(DEFAULT_EXPECTED_ELEMENTS); + } + + public int size() { + return assigned + (hasEmptyKey ? 1 : 0); + } + + public boolean isEmpty() { + return size() == 0; + } + + @Override + public int hashCode() { + int h = hasEmptyKey ? 0xDEADBEEF : 0; + for (CharObjectCursor c : this) { + h += BitMixer.mix(c.key) + BitMixer.mix(c.value); + } + return h; + } + + @Override + public boolean equals(Object obj) { + return (this == obj) + || (obj != null && getClass() == obj.getClass() && equalElements(getClass().cast(obj))); + } + + /** Return true if all keys of some other container exist in this container. */ + protected boolean equalElements(CharObjectHashMap other) { + if (other.size() != size()) { + return false; + } + + for (CharObjectCursor c : other) { + char key = c.key; + if (!containsKey(key) || !java.util.Objects.equals(c.value, get(key))) { + return false; + } + } + + return true; + } + + /** + * Ensure this container can hold at least the given number of keys (entries) without resizing its + * buffers. + * + * @param expectedElements The total number of keys, inclusive. 
+ */ + public void ensureCapacity(int expectedElements) { + if (expectedElements > resizeAt || keys == null) { + final char[] prevKeys = this.keys; + final VType[] prevValues = (VType[]) this.values; + allocateBuffers(minBufferSize(expectedElements, loadFactor)); + if (prevKeys != null && !isEmpty()) { + rehash(prevKeys, prevValues); + } + } + } + + /** + * Provides the next iteration seed used to build the iteration starting slot and offset + * increment. This method does not need to be synchronized, what matters is that each thread gets + * a sequence of varying seeds. + */ + protected int nextIterationSeed() { + return iterationSeed = BitMixer.mixPhi(iterationSeed); + } + + @Override + public Iterator> iterator() { + return new EntryIterator(); + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(keys) + sizeOfValues(); + } + + private long sizeOfValues() { + long size = RamUsageEstimator.shallowSizeOf(values); + for (ObjectCursor value : values()) { + size += RamUsageEstimator.sizeOfObject(value); + } + return size; + } + + /** An iterator implementation for {@link #iterator}. 
*/ + private final class EntryIterator extends AbstractIterator> { + private final CharObjectCursor cursor; + private final int increment; + private int index; + private int slot; + + public EntryIterator() { + cursor = new CharObjectCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected CharObjectCursor fetch() { + final int mask = CharObjectHashMap.this.mask; + while (index <= mask) { + char existing; + index++; + slot = (slot + increment) & mask; + if (!((existing = keys[slot]) == 0)) { + cursor.index = slot; + cursor.key = existing; + cursor.value = (VType) values[slot]; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index; + cursor.key = 0; + cursor.value = (VType) values[index++]; + return cursor; + } + + return done(); + } + } + + /** Returns a specialized view of the keys of this associated container. */ + public KeysContainer keys() { + return new KeysContainer(); + } + + /** A view of the keys inside this hash map. */ + public final class KeysContainer implements Iterable { + + @Override + public Iterator iterator() { + return new KeysIterator(); + } + + public int size() { + return CharObjectHashMap.this.size(); + } + + public char[] toArray() { + char[] array = new char[size()]; + int i = 0; + for (CharCursor cursor : this) { + array[i++] = cursor.value; + } + return array; + } + } + + /** An iterator over the set of assigned keys. 
*/ + private final class KeysIterator extends AbstractIterator { + private final CharCursor cursor; + private final int increment; + private int index; + private int slot; + + public KeysIterator() { + cursor = new CharCursor(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected CharCursor fetch() { + final int mask = CharObjectHashMap.this.mask; + while (index <= mask) { + char existing; + index++; + slot = (slot + increment) & mask; + if (!((existing = keys[slot]) == 0)) { + cursor.index = slot; + cursor.value = existing; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index++; + cursor.value = 0; + return cursor; + } + + return done(); + } + } + + /** + * @return Returns a container with all values stored in this map. + */ + public ValuesContainer values() { + return new ValuesContainer(); + } + + /** A view over the set of values of this map. */ + public final class ValuesContainer implements Iterable> { + + @Override + public Iterator> iterator() { + return new ValuesIterator(); + } + + public int size() { + return CharObjectHashMap.this.size(); + } + + public VType[] toArray() { + VType[] array = (VType[]) new Object[size()]; + int i = 0; + for (ObjectCursor cursor : this) { + array[i++] = cursor.value; + } + return array; + } + } + + /** An iterator over the set of assigned values. 
*/ + private final class ValuesIterator extends AbstractIterator> { + private final ObjectCursor cursor; + private final int increment; + private int index; + private int slot; + + public ValuesIterator() { + cursor = new ObjectCursor<>(); + int seed = nextIterationSeed(); + increment = iterationIncrement(seed); + slot = seed & mask; + } + + @Override + protected ObjectCursor fetch() { + final int mask = CharObjectHashMap.this.mask; + while (index <= mask) { + index++; + slot = (slot + increment) & mask; + if (!((keys[slot]) == 0)) { + cursor.index = slot; + cursor.value = (VType) values[slot]; + return cursor; + } + } + + if (index == mask + 1 && hasEmptyKey) { + cursor.index = index; + cursor.value = (VType) values[index++]; + return cursor; + } + + return done(); + } + } + + @Override + public CharObjectHashMap clone() { + try { + /* */ + CharObjectHashMap cloned = (CharObjectHashMap) super.clone(); + cloned.keys = keys.clone(); + cloned.values = values.clone(); + cloned.hasEmptyKey = hasEmptyKey; + cloned.iterationSeed = ITERATION_SEED.incrementAndGet(); + return cloned; + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } + } + + /** Convert the contents of this map to a human-friendly string. */ + @Override + public String toString() { + final StringBuilder buffer = new StringBuilder(); + buffer.append("["); + + boolean first = true; + for (CharObjectCursor cursor : this) { + if (!first) { + buffer.append(", "); + } + buffer.append(cursor.key); + buffer.append("=>"); + buffer.append(cursor.value); + first = false; + } + buffer.append("]"); + return buffer.toString(); + } + + /** Creates a hash map from two index-aligned arrays of key-value pairs. 
*/ + public static CharObjectHashMap from(char[] keys, VType[] values) { + if (keys.length != values.length) { + throw new IllegalArgumentException( + "Arrays of keys and values must have an identical length."); + } + + CharObjectHashMap map = new CharObjectHashMap<>(keys.length); + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + + return map; + } + + /** + * Returns a hash code for the given key. + * + *

The output from this function should evenly distribute keys across the entire integer range. + */ + protected int hashKey(char key) { + assert !((key) == 0); // Handled as a special case (empty slot marker). + return BitMixer.mixPhi(key); + } + + /** + * Validate load factor range and return it. Override and suppress if you need insane load + * factors. + */ + protected double verifyLoadFactor(double loadFactor) { + checkLoadFactor(loadFactor, MIN_LOAD_FACTOR, MAX_LOAD_FACTOR); + return loadFactor; + } + + /** Rehash from old buffers to new buffers. */ + protected void rehash(char[] fromKeys, VType[] fromValues) { + assert fromKeys.length == fromValues.length && checkPowerOfTwo(fromKeys.length - 1); + + // Rehash all stored key/value pairs into the new buffers. + final char[] keys = this.keys; + final VType[] values = (VType[]) this.values; + final int mask = this.mask; + char existing; + + // Copy the zero element's slot, then rehash everything else. + int from = fromKeys.length - 1; + keys[keys.length - 1] = fromKeys[from]; + values[values.length - 1] = fromValues[from]; + while (--from >= 0) { + if (!((existing = fromKeys[from]) == 0)) { + int slot = hashKey(existing) & mask; + while (!((keys[slot]) == 0)) { + slot = (slot + 1) & mask; + } + keys[slot] = existing; + values[slot] = fromValues[from]; + } + } + } + + /** + * Allocate new internal buffers. This method attempts to allocate and assign internal buffers + * atomically (either allocations succeed or not). + */ + protected void allocateBuffers(int arraySize) { + assert Integer.bitCount(arraySize) == 1; + + // Ensure no change is done if we hit an OOM. 
+ char[] prevKeys = this.keys; + VType[] prevValues = (VType[]) this.values; + try { + int emptyElementSlot = 1; + this.keys = (new char[arraySize + emptyElementSlot]); + this.values = new Object[arraySize + emptyElementSlot]; + } catch (OutOfMemoryError e) { + this.keys = prevKeys; + this.values = prevValues; + throw new BufferAllocationException( + "Not enough memory to allocate buffers for rehashing: %,d -> %,d", + e, this.mask + 1, arraySize); + } + + this.resizeAt = expandAtCount(arraySize, loadFactor); + this.mask = arraySize - 1; + } + + /** + * This method is invoked when there is a new key/ value pair to be inserted into the buffers but + * there is not enough empty slots to do so. + * + *

New buffers are allocated. If this succeeds, we know we can proceed with rehashing so we + * assign the pending element to the previous buffer (possibly violating the invariant of having + * at least one empty slot) and rehash all keys, substituting new buffers at the end. + */ + protected void allocateThenInsertThenRehash(int slot, char pendingKey, VType pendingValue) { + assert assigned == resizeAt && ((keys[slot]) == 0) && !((pendingKey) == 0); + + // Try to allocate new buffers first. If we OOM, we leave in a consistent state. + final char[] prevKeys = this.keys; + final VType[] prevValues = (VType[]) this.values; + allocateBuffers(nextBufferSize(mask + 1, size(), loadFactor)); + assert this.keys.length > prevKeys.length; + + // We have succeeded at allocating new data so insert the pending key/value at + // the free slot in the old arrays before rehashing. + prevKeys[slot] = pendingKey; + prevValues[slot] = pendingValue; + + // Rehash old keys, including the pending key. + rehash(prevKeys, prevValues); + } + + /** + * Shift all the slot-conflicting keys and values allocated to (and including) slot. + */ + protected void shiftConflictingKeys(int gapSlot) { + final char[] keys = this.keys; + final VType[] values = (VType[]) this.values; + final int mask = this.mask; + + // Perform shifts of conflicting keys to fill in the gap. + int distance = 0; + while (true) { + final int slot = (gapSlot + (++distance)) & mask; + final char existing = keys[slot]; + if (((existing) == 0)) { + break; + } + + final int idealSlot = hashKey(existing); + final int shift = (slot - idealSlot) & mask; + if (shift >= distance) { + // Entry at this position was originally at or before the gap slot. + // Move the conflict-shifted entry to the gap's position and repeat the procedure + // for any entries to the right of the current position, treating it + // as the new gap. 
+ keys[gapSlot] = existing; + values[gapSlot] = values[slot]; + gapSlot = slot; + distance = 0; + } + } + + // Mark the last found gap slot without a conflict as empty. + keys[gapSlot] = 0; + values[gapSlot] = null; + assigned--; + } + + /** Forked from HPPC, holding int index,key and value */ + public static final class CharObjectCursor { + /** + * The current key and value's index in the container this cursor belongs to. The meaning of + * this index is defined by the container (usually it will be an index in the underlying storage + * buffer). + */ + public int index; + + /** The current key. */ + public char key; + + /** The current value. */ + public VType value; + + @Override + public String toString() { + return "[cursor, index: " + index + ", key: " + key + ", value: " + value + "]"; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/hppc/HashContainers.java b/lucene/core/src/java/org/apache/lucene/util/hppc/HashContainers.java index 08fef9f9606..7859c457eb9 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hppc/HashContainers.java +++ b/lucene/core/src/java/org/apache/lucene/util/hppc/HashContainers.java @@ -22,26 +22,26 @@ import static org.apache.lucene.util.BitUtil.nextHighestPowerOfTwo; import java.util.concurrent.atomic.AtomicInteger; /** Constants for primitive maps. */ -public class HashContainers { +class HashContainers { - public static final int DEFAULT_EXPECTED_ELEMENTS = 4; + static final int DEFAULT_EXPECTED_ELEMENTS = 4; - public static final float DEFAULT_LOAD_FACTOR = 0.75f; + static final float DEFAULT_LOAD_FACTOR = 0.75f; /** Minimal sane load factor (99 empty slots per 100). */ - public static final float MIN_LOAD_FACTOR = 1 / 100.0f; + static final float MIN_LOAD_FACTOR = 1 / 100.0f; /** Maximum sane load factor (1 empty slot per 100). */ - public static final float MAX_LOAD_FACTOR = 99 / 100.0f; + static final float MAX_LOAD_FACTOR = 99 / 100.0f; /** Minimum hash buffer size. 
*/ - public static final int MIN_HASH_ARRAY_LENGTH = 4; + static final int MIN_HASH_ARRAY_LENGTH = 4; /** * Maximum array size for hash containers (power-of-two and still allocable in Java, not a * negative int). */ - public static final int MAX_HASH_ARRAY_LENGTH = 0x80000000 >>> 1; + static final int MAX_HASH_ARRAY_LENGTH = 0x80000000 >>> 1; static final AtomicInteger ITERATION_SEED = new AtomicInteger(); diff --git a/lucene/core/src/java/org/apache/lucene/util/hppc/IntHashSet.java b/lucene/core/src/java/org/apache/lucene/util/hppc/IntHashSet.java index c2d72930a93..2b2213a9242 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hppc/IntHashSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/hppc/IntHashSet.java @@ -152,6 +152,17 @@ public class IntHashSet implements Iterable, Accountable, Cloneable { return count; } + /** + * Adds all elements from the given set to this set. + * + * @return Returns the number of elements actually added as a result of this call (not previously + * present in the set). + */ + public int addAll(IntHashSet set) { + ensureCapacity(set.size()); + return addAll((Iterable) set); + } + /** * Adds all elements from the given iterable to this set. * diff --git a/lucene/core/src/java/org/apache/lucene/util/hppc/LongHashSet.java b/lucene/core/src/java/org/apache/lucene/util/hppc/LongHashSet.java index d131bfc7386..696c96083f5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hppc/LongHashSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/hppc/LongHashSet.java @@ -145,6 +145,17 @@ public class LongHashSet implements Iterable, Accountable, Cloneable return count; } + /** + * Adds all elements from the given set to this set. + * + * @return Returns the number of elements actually added as a result of this call (not previously + * present in the set). 
+ */ + public int addAll(LongHashSet set) { + ensureCapacity(set.size()); + return addAll((Iterable) set); + } + /** * Adds all elements from the given iterable to this set. * diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharHashSet.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharHashSet.java new file mode 100644 index 00000000000..b98c7c854db --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharHashSet.java @@ -0,0 +1,473 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.util.hppc; + +import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.not; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.hamcrest.MatcherAssert; +import org.junit.Before; +import org.junit.Test; + +/** + * Tests for {@link CharHashSet}. + * + *

Mostly forked and trimmed from com.carrotsearch.hppc.CharHashSetTest + * + *

github: https://github.com/carrotsearch/hppc release: 0.9.0 + */ +public class TestCharHashSet extends LuceneTestCase { + private static final char EMPTY_KEY = (char) 0; + + private final char keyE = 0; + private final char key1 = cast(1); + private final char key2 = cast(2); + private final char key3 = cast(3); + private final char key4 = cast(4); + + /** Per-test fresh initialized instance. */ + private CharHashSet set; + + /** Convert to target type from an integer used to test stuff. */ + private static char cast(int v) { + return (char) ('a' + v); + } + + @Before + public void initialize() { + set = new CharHashSet(); + } + + @Test + public void testAddAllViaInterface() { + set.addAll(key1, key2); + + CharHashSet iface = new CharHashSet(); + iface.clear(); + iface.addAll(set); + MatcherAssert.assertThat(set(iface.toArray()), is(equalTo(set(key1, key2)))); + } + + @Test + public void testIndexMethods() { + set.add(keyE); + set.add(key1); + + MatcherAssert.assertThat(set.indexOf(keyE), is(greaterThanOrEqualTo(0))); + MatcherAssert.assertThat(set.indexOf(key1), is(greaterThanOrEqualTo(0))); + MatcherAssert.assertThat(set.indexOf(key2), is(lessThan(0))); + + MatcherAssert.assertThat(set.indexExists(set.indexOf(keyE)), is(true)); + MatcherAssert.assertThat(set.indexExists(set.indexOf(key1)), is(true)); + MatcherAssert.assertThat(set.indexExists(set.indexOf(key2)), is(false)); + + MatcherAssert.assertThat(set.indexGet(set.indexOf(keyE)), is(equalTo(keyE))); + MatcherAssert.assertThat(set.indexGet(set.indexOf(key1)), is(equalTo(key1))); + + expectThrows( + AssertionError.class, + () -> { + set.indexGet(set.indexOf(key2)); + }); + + MatcherAssert.assertThat(set.indexReplace(set.indexOf(keyE), keyE), is(equalTo(keyE))); + MatcherAssert.assertThat(set.indexReplace(set.indexOf(key1), key1), is(equalTo(key1))); + + set.indexInsert(set.indexOf(key2), key2); + MatcherAssert.assertThat(set.indexGet(set.indexOf(key2)), is(equalTo(key2))); + 
MatcherAssert.assertThat(set.size(), is(equalTo(3))); + + set.indexRemove(set.indexOf(keyE)); + MatcherAssert.assertThat(set.size(), is(equalTo(2))); + set.indexRemove(set.indexOf(key2)); + MatcherAssert.assertThat(set.size(), is(equalTo(1))); + MatcherAssert.assertThat(set.indexOf(keyE), is(lessThan(0))); + MatcherAssert.assertThat(set.indexOf(key1), is(greaterThanOrEqualTo(0))); + MatcherAssert.assertThat(set.indexOf(key2), is(lessThan(0))); + } + + @Test + public void testCursorIndexIsValid() { + set.add(keyE); + set.add(key1); + set.add(key2); + + for (CharCursor c : set) { + MatcherAssert.assertThat(set.indexExists(c.index), is(true)); + MatcherAssert.assertThat(set.indexGet(c.index), is(equalTo(c.value))); + } + } + + @Test + public void testEmptyKey() { + CharHashSet set = new CharHashSet(); + + boolean b = set.add(EMPTY_KEY); + + MatcherAssert.assertThat(b, is(true)); + MatcherAssert.assertThat(set.add(EMPTY_KEY), is(false)); + MatcherAssert.assertThat(set.size(), is(equalTo(1))); + MatcherAssert.assertThat(set.isEmpty(), is(false)); + MatcherAssert.assertThat(set(set.toArray()), is(equalTo(set(EMPTY_KEY)))); + MatcherAssert.assertThat(set.contains(EMPTY_KEY), is(true)); + int index = set.indexOf(EMPTY_KEY); + MatcherAssert.assertThat(set.indexExists(index), is(true)); + MatcherAssert.assertThat(set.indexGet(index), is(equalTo(EMPTY_KEY))); + MatcherAssert.assertThat(set.indexReplace(index, EMPTY_KEY), is(equalTo(EMPTY_KEY))); + + if (random().nextBoolean()) { + b = set.remove(EMPTY_KEY); + MatcherAssert.assertThat(b, is(true)); + } else { + set.indexRemove(index); + } + + MatcherAssert.assertThat(set.size(), is(equalTo(0))); + MatcherAssert.assertThat(set.isEmpty(), is(true)); + MatcherAssert.assertThat(set(set.toArray()), is(empty())); + MatcherAssert.assertThat(set.contains(EMPTY_KEY), is(false)); + index = set.indexOf(EMPTY_KEY); + MatcherAssert.assertThat(set.indexExists(index), is(false)); + + set.indexInsert(index, EMPTY_KEY); + set.add(key1); + 
MatcherAssert.assertThat(set.size(), is(equalTo(2))); + MatcherAssert.assertThat(set.contains(EMPTY_KEY), is(true)); + index = set.indexOf(EMPTY_KEY); + MatcherAssert.assertThat(set.indexExists(index), is(true)); + MatcherAssert.assertThat(set.indexGet(index), is(equalTo(EMPTY_KEY))); + } + + @Test + public void testEnsureCapacity() { + final AtomicInteger expands = new AtomicInteger(); + CharHashSet set = + new CharHashSet(0) { + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + expands.incrementAndGet(); + } + }; + + // Add some elements. + final int max = rarely() ? 0 : randomIntBetween(0, 250); + for (int i = 0; i < max; i++) { + set.add(cast(i)); + } + + final int additions = randomIntBetween(max, max + 5000); + set.ensureCapacity(additions + set.size()); + final int before = expands.get(); + for (int i = 0; i < additions; i++) { + set.add(cast(i)); + } + assertEquals(before, expands.get()); + } + + @Test + public void testInitiallyEmpty() { + assertEquals(0, set.size()); + } + + @Test + public void testAdd() { + assertTrue(set.add(key1)); + assertFalse(set.add(key1)); + assertEquals(1, set.size()); + } + + @Test + public void testAdd2() { + set.addAll(key1, key1); + assertEquals(1, set.size()); + assertEquals(1, set.addAll(key1, key2)); + assertEquals(2, set.size()); + } + + @Test + public void testAddVarArgs() { + set.addAll(asArray(0, 1, 2, 1, 0)); + assertEquals(3, set.size()); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); + } + + @Test + public void testAddAll() { + CharHashSet set2 = new CharHashSet(); + set2.addAll(asArray(1, 2)); + set.addAll(asArray(0, 1)); + + assertEquals(1, set.addAll(set2)); + assertEquals(0, set.addAll(set2)); + + assertEquals(3, set.size()); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); + } + + @Test + public void testRemove() { + set.addAll(asArray(0, 1, 2, 3, 4)); + + assertTrue(set.remove(key2)); + assertFalse(set.remove(key2)); + assertEquals(4, 
set.size()); + assertSortedListEquals(set.toArray(), asArray(0, 1, 3, 4)); + } + + @Test + public void testInitialCapacityAndGrowth() { + for (int i = 0; i < 256; i++) { + CharHashSet set = new CharHashSet(i); + + for (int j = 0; j < i; j++) { + set.add(cast(j)); + } + + assertEquals(i, set.size()); + } + } + + @Test + public void testBug_HPPC73_FullCapacityGet() { + final AtomicInteger reallocations = new AtomicInteger(); + final int elements = 0x7F; + set = + new CharHashSet(elements, 1f) { + @Override + protected double verifyLoadFactor(double loadFactor) { + // Skip load factor sanity range checking. + return loadFactor; + } + + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + reallocations.incrementAndGet(); + } + }; + + int reallocationsBefore = reallocations.get(); + assertEquals(reallocationsBefore, 1); + for (int i = 1; i <= elements; i++) { + set.add(cast(i)); + } + + // Non-existent key. + char outOfSet = cast(elements + 1); + set.remove(outOfSet); + assertFalse(set.contains(outOfSet)); + assertEquals(reallocationsBefore, reallocations.get()); + + // Should not expand because we're replacing an existing element. + assertFalse(set.add(key1)); + assertEquals(reallocationsBefore, reallocations.get()); + + // Remove from a full set. + set.remove(key1); + assertEquals(reallocationsBefore, reallocations.get()); + set.add(key1); + + // Check expand on "last slot of a full map" condition. 
+ set.add(outOfSet); + assertEquals(reallocationsBefore + 1, reallocations.get()); + } + + @Test + public void testRemoveAllFromLookupContainer() { + set.addAll(asArray(0, 1, 2, 3, 4)); + + CharHashSet list2 = new CharHashSet(); + list2.addAll(asArray(1, 3, 5)); + + assertEquals(2, set.removeAll(list2)); + assertEquals(3, set.size()); + assertSortedListEquals(set.toArray(), asArray(0, 2, 4)); + } + + @Test + public void testClear() { + set.addAll(asArray(1, 2, 3)); + set.clear(); + assertEquals(0, set.size()); + } + + @Test + public void testRelease() { + set.addAll(asArray(1, 2, 3)); + set.release(); + assertEquals(0, set.size()); + set.addAll(asArray(1, 2, 3)); + assertEquals(3, set.size()); + } + + @Test + public void testIterable() { + set.addAll(asArray(1, 2, 2, 3, 4)); + set.remove(key2); + assertEquals(3, set.size()); + + int count = 0; + for (CharCursor cursor : set) { + count++; + assertTrue(set.contains(cursor.value)); + } + assertEquals(count, set.size()); + + set.clear(); + assertFalse(set.iterator().hasNext()); + } + + /** Runs random insertions/deletions/clearing and compares the results against {@link HashSet}. 
*/ + @Test + @SuppressWarnings({"rawtypes", "unchecked"}) + public void testAgainstHashSet() { + final Random rnd = RandomizedTest.getRandom(); + final HashSet other = new HashSet(); + + for (int size = 1000; size < 20000; size += 4000) { + other.clear(); + set.clear(); + + for (int round = 0; round < size * 20; round++) { + char key = cast(rnd.nextInt(size)); + if (rnd.nextInt(50) == 0) { + key = EMPTY_KEY; + } + + if (rnd.nextBoolean()) { + if (rnd.nextBoolean()) { + int index = set.indexOf(key); + if (set.indexExists(index)) { + set.indexReplace(index, key); + } else { + set.indexInsert(index, key); + } + } else { + set.add(key); + } + other.add(key); + + assertTrue(set.contains(key)); + assertTrue(set.indexExists(set.indexOf(key))); + } else { + assertEquals(other.contains(key), set.contains(key)); + boolean removed; + if (set.contains(key) && rnd.nextBoolean()) { + set.indexRemove(set.indexOf(key)); + removed = true; + } else { + removed = set.remove(key); + } + assertEquals(other.remove(key), removed); + } + + assertEquals(other.size(), set.size()); + } + } + } + + @Test + public void testHashCodeEquals() { + CharHashSet l0 = new CharHashSet(); + assertEquals(0, l0.hashCode()); + assertEquals(l0, new CharHashSet()); + + CharHashSet l1 = CharHashSet.from(key1, key2, key3); + CharHashSet l2 = CharHashSet.from(key1, key2); + l2.add(key3); + + assertEquals(l1.hashCode(), l2.hashCode()); + assertEquals(l1, l2); + } + + @Test + public void testClone() { + this.set.addAll(asArray(1, 2, 3)); + + CharHashSet cloned = set.clone(); + cloned.remove(key1); + + assertSortedListEquals(set.toArray(), asArray(1, 2, 3)); + assertSortedListEquals(cloned.toArray(), asArray(2, 3)); + } + + @Test + public void testEqualsSameClass() { + CharHashSet l1 = CharHashSet.from(key1, key2, key3); + CharHashSet l2 = CharHashSet.from(key1, key2, key3); + CharHashSet l3 = CharHashSet.from(key1, key2, key4); + + MatcherAssert.assertThat(l1, is(equalTo(l2))); + 
MatcherAssert.assertThat(l1.hashCode(), is(equalTo(l2.hashCode()))); + MatcherAssert.assertThat(l1, is(not(equalTo(l3)))); + } + + @Test + public void testEqualsSubClass() { + class Sub extends CharHashSet {} + ; + + CharHashSet l1 = CharHashSet.from(key1, key2, key3); + CharHashSet l2 = new Sub(); + CharHashSet l3 = new Sub(); + l2.addAll(l1); + l3.addAll(l1); + + MatcherAssert.assertThat(l2, is(equalTo(l3))); + MatcherAssert.assertThat(l1, is(not(equalTo(l2)))); + } + + private static int randomIntBetween(int min, int max) { + return min + random().nextInt(max + 1 - min); + } + + private static Set set(char... elements) { + Set set = new HashSet<>(); + for (char element : elements) { + set.add(element); + } + return set; + } + + private static char[] asArray(int... elements) { + char[] result = new char[elements.length]; + for (int i = 0; i < elements.length; i++) { + result[i] = cast(elements[i]); + } + return result; + } + + /** Check if the array's content is identical to a given sequence of elements. */ + private static void assertSortedListEquals(char[] array, char[] elements) { + assertEquals(elements.length, array.length); + Arrays.sort(array); + assertArrayEquals(elements, array); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharObjectHashMap.java new file mode 100644 index 00000000000..1a3e58f51a4 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestCharObjectHashMap.java @@ -0,0 +1,671 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hppc; + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Random; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.junit.After; +import org.junit.Test; + +/** + * Tests for {@link CharObjectHashMap}. + * + *

Mostly forked and trimmed from com.carrotsearch.hppc.CharObjectHashMapTest + * + *

github: https://github.com/carrotsearch/hppc release: 0.9.0 + */ +@SuppressWarnings({"rawtypes", "unchecked"}) +public class TestCharObjectHashMap extends LuceneTestCase { + /* Ready to use key values. */ + + private final char keyE = 0; + private final char key1 = cast(1); + private final char key2 = cast(2); + private final char key3 = cast(3); + private final char key4 = cast(4); + + /** Convert to target type from an integer used to test stuff. */ + private char cast(int v) { + return (char) ('a' + v); + } + + /** Create a new array of a given type and copy the arguments to this array. */ + private char[] newArray(char... elements) { + return elements; + } + + private static int randomIntBetween(int min, int max) { + return min + random().nextInt(max + 1 - min); + } + + /** Check if the array's content is identical to a given sequence of elements. */ + private static void assertSortedListEquals(char[] array, char... elements) { + assertEquals(elements.length, array.length); + Arrays.sort(array); + Arrays.sort(elements); + assertArrayEquals(elements, array); + } + + /** Check if the array's content is identical to a given sequence of elements. */ + private static void assertSortedListEquals(Object[] array, Object... elements) { + assertEquals(elements.length, array.length); + Arrays.sort(array); + assertArrayEquals(elements, array); + } + + private final int value0 = vcast(0); + private final int value1 = vcast(1); + private final int value2 = vcast(2); + private final int value3 = vcast(3); + private final int value4 = vcast(4); + + /** Per-test fresh initialized instance. 
*/ + private CharObjectHashMap map = newInstance(); + + private CharObjectHashMap newInstance() { + return new CharObjectHashMap(); + } + + @After + public void checkEmptySlotsUninitialized() { + if (map != null) { + int occupied = 0; + for (int i = 0; i <= map.mask; i++) { + if (((map.keys[i]) == 0)) { + + } else { + occupied++; + } + } + assertEquals(occupied, map.assigned); + + if (!map.hasEmptyKey) {} + } + } + + /** Convert to target type from an integer used to test stuff. */ + private int vcast(int value) { + return value; + } + + /** Create a new array of a given type and copy the arguments to this array. */ + /* */ + private Object[] newvArray(Object... elements) { + return elements; + } + + private void assertSameMap( + final CharObjectHashMap c1, final CharObjectHashMap c2) { + assertEquals(c1.size(), c2.size()); + + for (CharObjectHashMap.CharObjectCursor entry : c1) { + assertTrue(c2.containsKey(entry.key)); + assertEquals(entry.value, c2.get(entry.key)); + } + } + + /* */ + @Test + public void testEnsureCapacity() { + final AtomicInteger expands = new AtomicInteger(); + CharObjectHashMap map = + new CharObjectHashMap(0) { + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + expands.incrementAndGet(); + } + }; + + // Add some elements. + final int max = rarely() ? 
0 : randomIntBetween(0, 250); + for (int i = 0; i < max; i++) { + map.put(cast(i), value0); + } + + final int additions = randomIntBetween(max, max + 5000); + map.ensureCapacity(additions + map.size()); + final int before = expands.get(); + for (int i = 0; i < additions; i++) { + map.put(cast(i), value0); + } + assertEquals(before, expands.get()); + } + + @Test + public void testCursorIndexIsValid() { + map.put(keyE, value1); + map.put(key1, value2); + map.put(key2, value3); + + for (CharObjectHashMap.CharObjectCursor c : map) { + assertTrue(map.indexExists(c.index)); + assertEquals(c.value, map.indexGet(c.index)); + } + } + + @Test + public void testIndexMethods() { + map.put(keyE, value1); + map.put(key1, value2); + + assertTrue(map.indexOf(keyE) >= 0); + assertTrue(map.indexOf(key1) >= 0); + assertTrue(map.indexOf(key2) < 0); + + assertTrue(map.indexExists(map.indexOf(keyE))); + assertTrue(map.indexExists(map.indexOf(key1))); + assertFalse(map.indexExists(map.indexOf(key2))); + + assertEquals(value1, map.indexGet(map.indexOf(keyE))); + assertEquals(value2, map.indexGet(map.indexOf(key1))); + + expectThrows( + AssertionError.class, + () -> { + map.indexGet(map.indexOf(key2)); + }); + + assertEquals(value1, map.indexReplace(map.indexOf(keyE), value3)); + assertEquals(value2, map.indexReplace(map.indexOf(key1), value4)); + assertEquals(value3, map.indexGet(map.indexOf(keyE))); + assertEquals(value4, map.indexGet(map.indexOf(key1))); + + map.indexInsert(map.indexOf(key2), key2, value1); + assertEquals(value1, map.indexGet(map.indexOf(key2))); + assertEquals(3, map.size()); + + assertEquals(value3, map.indexRemove(map.indexOf(keyE))); + assertEquals(2, map.size()); + assertEquals(value1, map.indexRemove(map.indexOf(key2))); + assertEquals(1, map.size()); + assertTrue(map.indexOf(keyE) < 0); + assertTrue(map.indexOf(key1) >= 0); + assertTrue(map.indexOf(key2) < 0); + } + + /* */ + @Test + public void testCloningConstructor() { + map.put(key1, value1); + map.put(key2, 
value2); + map.put(key3, value3); + + assertSameMap(map, new CharObjectHashMap(map)); + } + + /* */ + @Test + public void testFromArrays() { + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value3); + + CharObjectHashMap map2 = + CharObjectHashMap.from(newArray(key1, key2, key3), newvArray(value1, value2, value3)); + + assertSameMap(map, map2); + } + + @Test + public void testGetOrDefault() { + map.put(key2, value2); + assertTrue(map.containsKey(key2)); + + map.put(key1, value1); + assertEquals(value1, map.getOrDefault(key1, value3)); + assertEquals(value3, map.getOrDefault(key3, value3)); + map.remove(key1); + assertEquals(value3, map.getOrDefault(key1, value3)); + } + + /* */ + @Test + public void testPut() { + map.put(key1, value1); + + assertTrue(map.containsKey(key1)); + assertEquals(value1, map.get(key1)); + } + + /* */ + @Test + public void testNullValue() { + map.put(key1, null); + + assertTrue(map.containsKey(key1)); + assertNull(map.get(key1)); + } + + @Test + public void testPutOverExistingKey() { + map.put(key1, value1); + assertEquals(value1, map.put(key1, value3)); + assertEquals(value3, map.get(key1)); + + assertEquals(value3, map.put(key1, null)); + assertTrue(map.containsKey(key1)); + assertNull(map.get(key1)); + + assertNull(map.put(key1, value1)); + assertEquals(value1, map.get(key1)); + } + + /* */ + @Test + public void testPutWithExpansions() { + final int COUNT = 10000; + final Random rnd = new Random(random().nextLong()); + final HashSet values = new HashSet(); + + for (int i = 0; i < COUNT; i++) { + final int v = rnd.nextInt(); + final boolean hadKey = values.contains(cast(v)); + values.add(cast(v)); + + assertEquals(hadKey, map.containsKey(cast(v))); + map.put(cast(v), vcast(v)); + assertEquals(values.size(), map.size()); + } + assertEquals(values.size(), map.size()); + } + + /* */ + @Test + public void testPutAll() { + map.put(key1, value1); + map.put(key2, value1); + + CharObjectHashMap map2 = newInstance(); + + 
map2.put(key2, value2); + map2.put(keyE, value1); + + // One new key (keyE). + assertEquals(1, map.putAll(map2)); + + // Assert the value under key2 has been replaced. + assertEquals(value2, map.get(key2)); + + // And key3 has been added. + assertEquals(value1, map.get(keyE)); + assertEquals(3, map.size()); + } + + /* */ + @Test + public void testPutIfAbsent() { + assertTrue(map.putIfAbsent(key1, value1)); + assertFalse(map.putIfAbsent(key1, value2)); + assertEquals(value1, map.get(key1)); + } + + /* */ + @Test + public void testRemove() { + map.put(key1, value1); + assertEquals(value1, map.remove(key1)); + assertEquals(null, map.remove(key1)); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + } + + /* */ + @Test + public void testEmptyKey() { + final char empty = 0; + + map.put(empty, value1); + assertEquals(1, map.size()); + assertEquals(false, map.isEmpty()); + assertEquals(value1, map.get(empty)); + assertEquals(value1, map.getOrDefault(empty, value2)); + assertEquals(true, map.iterator().hasNext()); + assertEquals(empty, map.iterator().next().key); + assertEquals(value1, map.iterator().next().value); + + map.remove(empty); + assertEquals(null, map.get(empty)); + assertEquals(0, map.size()); + + map.put(empty, null); + assertEquals(1, map.size()); + assertTrue(map.containsKey(empty)); + assertNull(map.get(empty)); + + map.remove(empty); + assertEquals(0, map.size()); + assertFalse(map.containsKey(empty)); + assertNull(map.get(empty)); + + assertEquals(null, map.put(empty, value1)); + assertEquals(value1, map.put(empty, value2)); + map.clear(); + assertFalse(map.indexExists(map.indexOf(empty))); + assertEquals(null, map.put(empty, value1)); + map.clear(); + assertEquals(null, map.remove(empty)); + } + + /* */ + @Test + public void testMapKeySet() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + assertSortedListEquals(map.keys().toArray(), key1, key2, 
key3); + } + + /* */ + @Test + public void testMapKeySetIterator() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + int counted = 0; + for (CharCursor c : map.keys()) { + assertEquals(map.keys[c.index], c.value); + counted++; + } + assertEquals(counted, map.size()); + } + + /* */ + @Test + public void testClear() { + map.put(key1, value1); + map.put(key2, value1); + map.clear(); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + + // Check values are cleared. + assertEquals(null, map.put(key1, value1)); + assertEquals(null, map.remove(key2)); + map.clear(); + + // Check if the map behaves properly upon subsequent use. + testPutWithExpansions(); + } + + /* */ + @Test + public void testRelease() { + map.put(key1, value1); + map.put(key2, value1); + map.release(); + assertEquals(0, map.size()); + + // These are internals, but perhaps worth asserting too. + assertEquals(0, map.assigned); + + // Check if the map behaves properly upon subsequent use. + testPutWithExpansions(); + } + + /* */ + @Test + public void testIterable() { + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value3); + map.remove(key2); + + int count = 0; + for (CharObjectHashMap.CharObjectCursor cursor : map) { + count++; + assertTrue(map.containsKey(cursor.key)); + assertEquals(cursor.value, map.get(cursor.key)); + + assertEquals(cursor.value, map.values[cursor.index]); + assertEquals(cursor.key, map.keys[cursor.index]); + } + assertEquals(count, map.size()); + + map.clear(); + assertFalse(map.iterator().hasNext()); + } + + /* */ + @Test + public void testBug_HPPC73_FullCapacityGet() { + final AtomicInteger reallocations = new AtomicInteger(); + final int elements = 0x7F; + map = + new CharObjectHashMap(elements, 1f) { + @Override + protected double verifyLoadFactor(double loadFactor) { + // Skip load factor sanity range checking. 
+ return loadFactor; + } + + @Override + protected void allocateBuffers(int arraySize) { + super.allocateBuffers(arraySize); + reallocations.incrementAndGet(); + } + }; + + int reallocationsBefore = reallocations.get(); + assertEquals(reallocationsBefore, 1); + for (int i = 1; i <= elements; i++) { + map.put(cast(i), value1); + } + + // Non-existent key. + char outOfSet = cast(elements + 1); + map.remove(outOfSet); + assertFalse(map.containsKey(outOfSet)); + assertEquals(reallocationsBefore, reallocations.get()); + + // Should not expand because we're replacing an existing element. + map.put(key1, value2); + assertEquals(reallocationsBefore, reallocations.get()); + + // Remove from a full map. + map.remove(key1); + assertEquals(reallocationsBefore, reallocations.get()); + map.put(key1, value2); + + // Check expand on "last slot of a full map" condition. + map.put(outOfSet, value1); + assertEquals(reallocationsBefore + 1, reallocations.get()); + } + + @Test + public void testHashCodeEquals() { + CharObjectHashMap l0 = newInstance(); + assertEquals(0, l0.hashCode()); + assertEquals(l0, newInstance()); + + CharObjectHashMap l1 = + CharObjectHashMap.from(newArray(key1, key2, key3), newvArray(value1, value2, value3)); + + CharObjectHashMap l2 = + CharObjectHashMap.from(newArray(key2, key1, key3), newvArray(value2, value1, value3)); + + CharObjectHashMap l3 = CharObjectHashMap.from(newArray(key1, key2), newvArray(value2, value1)); + + assertEquals(l1.hashCode(), l2.hashCode()); + assertEquals(l1, l2); + + assertFalse(l1.equals(l3)); + assertFalse(l2.equals(l3)); + } + + @Test + public void testBug_HPPC37() { + CharObjectHashMap l1 = CharObjectHashMap.from(newArray(key1), newvArray(value1)); + + CharObjectHashMap l2 = CharObjectHashMap.from(newArray(key2), newvArray(value1)); + + assertFalse(l1.equals(l2)); + assertFalse(l2.equals(l1)); + } + + /** Runs random insertions/deletions/clearing and compares the results against {@link HashMap}. 
*/ + @Test + @SuppressWarnings({"rawtypes", "unchecked"}) + public void testAgainstHashMap() { + final Random rnd = RandomizedTest.getRandom(); + final HashMap other = new HashMap(); + + for (int size = 1000; size < 20000; size += 4000) { + other.clear(); + map.clear(); + + for (int round = 0; round < size * 20; round++) { + char key = cast(rnd.nextInt(size)); + if (rnd.nextInt(50) == 0) { + key = 0; + } + + int value = vcast(rnd.nextInt()); + + if (rnd.nextBoolean()) { + Object previousValue; + if (rnd.nextBoolean()) { + int index = map.indexOf(key); + if (map.indexExists(index)) { + previousValue = map.indexReplace(index, value); + } else { + map.indexInsert(index, key, value); + previousValue = null; + } + } else { + previousValue = map.put(key, value); + } + assertEquals(other.put(key, value), previousValue); + + assertEquals(value, map.get(key)); + assertEquals(value, map.indexGet(map.indexOf(key))); + assertTrue(map.containsKey(key)); + assertTrue(map.indexExists(map.indexOf(key))); + } else { + assertEquals(other.containsKey(key), map.containsKey(key)); + Object previousValue = + map.containsKey(key) && rnd.nextBoolean() + ? 
map.indexRemove(map.indexOf(key)) + : map.remove(key); + assertEquals(other.remove(key), previousValue); + } + + assertEquals(other.size(), map.size()); + } + } + } + + /* + * + */ + @Test + public void testClone() { + this.map.put(key1, value1); + this.map.put(key2, value2); + this.map.put(key3, value3); + + CharObjectHashMap cloned = map.clone(); + cloned.remove(key1); + + assertSortedListEquals(map.keys().toArray(), key1, key2, key3); + assertSortedListEquals(cloned.keys().toArray(), key2, key3); + } + + /* */ + @Test + public void testMapValues() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + assertSortedListEquals(map.values().toArray(), value1, value2, value3); + + map.clear(); + map.put(key1, value1); + map.put(key2, value2); + map.put(key3, value2); + assertSortedListEquals(map.values().toArray(), value1, value2, value2); + } + + /* */ + @Test + public void testMapValuesIterator() { + map.put(key1, value3); + map.put(key2, value2); + map.put(key3, value1); + + int counted = 0; + for (ObjectCursor c : map.values()) { + assertEquals(map.values[c.index], c.value); + counted++; + } + assertEquals(counted, map.size()); + } + + /* */ + @Test + public void testEqualsSameClass() { + CharObjectHashMap l1 = newInstance(); + l1.put(key1, value0); + l1.put(key2, value1); + l1.put(key3, value2); + + CharObjectHashMap l2 = new CharObjectHashMap(l1); + l2.putAll(l1); + + CharObjectHashMap l3 = new CharObjectHashMap(l2); + l3.putAll(l2); + l3.put(key4, value0); + + assertEquals(l2, l1); + assertEquals(l2.hashCode(), l1.hashCode()); + assertNotEquals(l1, l3); + } + + /* */ + @Test + public void testEqualsSubClass() { + class Sub extends CharObjectHashMap {} + + CharObjectHashMap l1 = newInstance(); + l1.put(key1, value0); + l1.put(key2, value1); + l1.put(key3, value2); + + CharObjectHashMap l2 = new Sub(); + l2.putAll(l1); + l2.put(key4, value3); + + CharObjectHashMap l3 = new Sub(); + l3.putAll(l2); + + assertNotEquals(l1, l2); + 
assertEquals(l3.hashCode(), l2.hashCode()); + assertEquals(l3, l2); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestIntHashSet.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestIntHashSet.java index b911e0d8a4f..2e76b18a6bc 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hppc/TestIntHashSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestIntHashSet.java @@ -215,7 +215,7 @@ public class TestIntHashSet extends LuceneTestCase { public void testAddVarArgs() { set.addAll(asArray(0, 1, 2, 1, 0)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 2); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); } @Test @@ -228,7 +228,7 @@ public class TestIntHashSet extends LuceneTestCase { assertEquals(0, set.addAll(set2)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 2); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); } @Test @@ -238,7 +238,7 @@ public class TestIntHashSet extends LuceneTestCase { assertTrue(set.remove(key2)); assertFalse(set.remove(key2)); assertEquals(4, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 3, 4); + assertSortedListEquals(set.toArray(), asArray(0, 1, 3, 4)); } @Test @@ -308,7 +308,7 @@ public class TestIntHashSet extends LuceneTestCase { assertEquals(2, set.removeAll(list2)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 2, 4); + assertSortedListEquals(set.toArray(), asArray(0, 2, 4)); } @Test @@ -409,13 +409,13 @@ public class TestIntHashSet extends LuceneTestCase { @Test public void testClone() { - this.set.addAll(key1, key2, key3); + this.set.addAll(asArray(1, 2, 3)); IntHashSet cloned = set.clone(); cloned.remove(key1); - assertSortedListEquals(set.toArray(), key1, key2, key3); - assertSortedListEquals(cloned.toArray(), key2, key3); + assertSortedListEquals(set.toArray(), asArray(1, 2, 3)); + assertSortedListEquals(cloned.toArray(), asArray(2, 3)); } @Test @@ -461,7 +461,7 @@ 
public class TestIntHashSet extends LuceneTestCase { } /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(int[] array, int... elements) { + private static void assertSortedListEquals(int[] array, int[] elements) { assertEquals(elements.length, array.length); Arrays.sort(array); assertArrayEquals(elements, array); diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongHashSet.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongHashSet.java index 14ea848ecb9..4c309cea8e3 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongHashSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongHashSet.java @@ -210,7 +210,7 @@ public class TestLongHashSet extends LuceneTestCase { public void testAddVarArgs() { set.addAll(asArray(0, 1, 2, 1, 0)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 2); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); } @Test @@ -223,7 +223,7 @@ public class TestLongHashSet extends LuceneTestCase { assertEquals(0, set.addAll(set2)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 2); + assertSortedListEquals(set.toArray(), asArray(0, 1, 2)); } @Test @@ -233,7 +233,7 @@ public class TestLongHashSet extends LuceneTestCase { assertTrue(set.remove(key2)); assertFalse(set.remove(key2)); assertEquals(4, set.size()); - assertSortedListEquals(set.toArray(), 0, 1, 3, 4); + assertSortedListEquals(set.toArray(), asArray(0, 1, 3, 4)); } @Test @@ -303,7 +303,7 @@ public class TestLongHashSet extends LuceneTestCase { assertEquals(2, set.removeAll(list2)); assertEquals(3, set.size()); - assertSortedListEquals(set.toArray(), 0, 2, 4); + assertSortedListEquals(set.toArray(), asArray(0, 2, 4)); } @Test @@ -404,13 +404,13 @@ public class TestLongHashSet extends LuceneTestCase { @Test public void testClone() { - this.set.addAll(key1, key2, key3); + this.set.addAll(asArray(1, 
2, 3)); LongHashSet cloned = set.clone(); cloned.remove(key1); - assertSortedListEquals(set.toArray(), key1, key2, key3); - assertSortedListEquals(cloned.toArray(), key2, key3); + assertSortedListEquals(set.toArray(), asArray(1, 2, 3)); + assertSortedListEquals(cloned.toArray(), asArray(2, 3)); } @Test @@ -456,7 +456,7 @@ public class TestLongHashSet extends LuceneTestCase { } /** Check if the array's content is identical to a given sequence of elements. */ - private static void assertSortedListEquals(long[] array, long... elements) { + private static void assertSortedListEquals(long[] array, long[] elements) { assertEquals(elements.length, array.length); Arrays.sort(array); assertArrayEquals(elements, array); diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongIntHashMap.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongIntHashMap.java index 4dddb081ea5..0d3468c4b5b 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongIntHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongIntHashMap.java @@ -331,7 +331,7 @@ public class TestLongIntHashMap extends LuceneTestCase { /* */ @Test public void testEmptyKey() { - final int empty = 0; + final long empty = 0; map.put(empty, value1); assertEquals(1, map.size()); diff --git a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongObjectHashMap.java b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongObjectHashMap.java index 7d368eed8d3..10b661c258a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongObjectHashMap.java +++ b/lucene/core/src/test/org/apache/lucene/util/hppc/TestLongObjectHashMap.java @@ -335,7 +335,7 @@ public class TestLongObjectHashMap extends LuceneTestCase { /* */ @Test public void testEmptyKey() { - final int empty = 0; + final long empty = 0; map.put(empty, value1); assertEquals(1, map.size()); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java 
b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java index e6c97779905..048582d6e43 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java @@ -150,9 +150,9 @@ public class StringValueFacetCounts extends Facets { List labelValues = new ArrayList<>(); if (sparseCounts != null) { - for (IntIntCursor cursor : sparseCounts) { - int count = cursor.value; - final BytesRef term = docValues.lookupOrd(cursor.key); + for (IntIntCursor sparseCount : sparseCounts) { + int count = sparseCount.value; + final BytesRef term = docValues.lookupOrd(sparseCount.key); labelValues.add(new LabelAndValue(term.utf8ToString(), count)); } } else { @@ -186,10 +186,10 @@ public class StringValueFacetCounts extends Facets { int childCount = 0; // total number of labels with non-zero count if (sparseCounts != null) { - for (IntIntCursor cursor : sparseCounts) { + for (IntIntCursor sparseCount : sparseCounts) { childCount++; // every count in sparseValues should be non-zero - int ord = cursor.key; - int count = cursor.value; + int ord = sparseCount.key; + int count = sparseCount.value; if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) { if (q == null) { // Lazy init for sparse case: diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java index e26d2f3d5a7..b46d7a5d08d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java @@ -16,9 +16,9 @@ */ package org.apache.lucene.search.vectorhighlight; -import java.util.Arrays; -import java.util.HashSet; +import java.util.Iterator; import java.util.Set; +import org.apache.lucene.util.hppc.CharHashSet; /** 
* Simple boundary scanner implementation that divides fragments based on a set of separator @@ -27,10 +27,10 @@ import java.util.Set; public class SimpleBoundaryScanner implements BoundaryScanner { public static final int DEFAULT_MAX_SCAN = 20; - public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'}; + public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'}; protected int maxScan; - protected Set boundaryChars; + protected CharHashSet boundaryChars; public SimpleBoundaryScanner() { this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS); @@ -44,15 +44,34 @@ public class SimpleBoundaryScanner implements BoundaryScanner { this(DEFAULT_MAX_SCAN, boundaryChars); } - public SimpleBoundaryScanner(int maxScan, Character[] boundaryChars) { + public SimpleBoundaryScanner(int maxScan, char[] boundaryChars) { this.maxScan = maxScan; - this.boundaryChars = new HashSet<>(); - this.boundaryChars.addAll(Arrays.asList(boundaryChars)); + this.boundaryChars = CharHashSet.from(boundaryChars); + } + + public SimpleBoundaryScanner(int maxScan, Character[] boundaryChars) { + this(maxScan, toCharArray(boundaryChars)); } public SimpleBoundaryScanner(int maxScan, Set boundaryChars) { - this.maxScan = maxScan; - this.boundaryChars = boundaryChars; + this(maxScan, toCharArray(boundaryChars)); + } + + private static char[] toCharArray(Character[] characters) { + char[] chars = new char[characters.length]; + for (int i = 0; i < characters.length; i++) { + chars[i] = characters[i]; + } + return chars; + } + + private static char[] toCharArray(Set characters) { + Iterator iterator = characters.iterator(); + char[] chars = new char[characters.size()]; + for (int i = 0; i < chars.length; i++) { + chars[i] = iterator.next(); + } + return chars; } @Override