From 2509d35c11d24a660b26b4975602161db82500e7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 1 Mar 2011 15:47:14 +0000 Subject: [PATCH] LUCENE-2943: fix thread-safety issues with ICU collation git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1075850 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 + .../lucene/collation/CollationKeyFilter.java | 4 +- .../CollatedTermAttributeImpl.java | 4 +- .../lucene/collation/CollationTestBase.java | 78 +++++++++++++++++++ .../collation/TestCollationKeyAnalyzer.java | 10 +++ .../collation/ICUCollationKeyFilter.java | 7 +- .../ICUCollatedTermAttributeImpl.java | 7 +- .../TestICUCollationKeyAnalyzer.java | 10 +++ 8 files changed, 119 insertions(+), 4 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 7251f49788c..e3af7c72a7b 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -148,6 +148,9 @@ Bug fixes * LUCENE-2874: Highlighting overlapping tokens outputted doubled words. (Pierre Gossé via Robert Muir) + + * LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter. + (Robert Muir) API Changes diff --git a/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java b/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java index 68d3e60e0e0..39ac69916c0 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java @@ -85,7 +85,9 @@ public final class CollationKeyFilter extends TokenFilter { */ public CollationKeyFilter(TokenStream input, Collator collator) { super(input); - this.collator = collator; + // clone in case JRE doesnt properly sync, + // or to reduce contention in case they do + this.collator = (Collator) collator.clone(); } @Override diff --git a/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java b/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java index 427793f2bc4..4e4d5d6cb29 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java +++ b/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java @@ -34,7 +34,9 @@ public class CollatedTermAttributeImpl extends CharTermAttributeImpl { * @param collator Collation key generator */ public CollatedTermAttributeImpl(Collator collator) { - this.collator = collator; + // clone in case JRE doesnt properly sync, + // or to reduce contention in case they do + this.collator = (Collator) collator.clone(); } @Override diff --git a/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java b/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java index f5379977188..949755d03fb 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java +++ b/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java @@ -21,6 +21,8 @@ package org.apache.lucene.collation; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -39,9 +41,12 @@ import org.apache.lucene.document.Document; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IndexableBinaryStringTools; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; import java.io.StringReader; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; public abstract class CollationTestBase extends LuceneTestCase { @@ -252,4 +257,77 @@ public abstract class CollationTestBase extends LuceneTestCase { } assertEquals(expectedResult, buff.toString()); } + + private String randomString() { + // ideally we could do this! + // return _TestUtil.randomUnicodeString(random); + // + // http://bugs.icu-project.org/trac/ticket/8060 + // http://bugs.icu-project.org/trac/ticket/7732 + // ... + // + // as a workaround, just test the BMP for now (and avoid 0xFFFF etc) + int length = _TestUtil.nextInt(random, 0, 10); + char chars[] = new char[length]; + for (int i = 0; i < length; i++) { + if (random.nextBoolean()) { + chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF); + } else { + chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD); + } + } + return new String(chars, 0, length); + } + + public void assertThreadSafe(final Analyzer analyzer) throws Exception { + int numTestPoints = 1000 * RANDOM_MULTIPLIER; + int numThreads = _TestUtil.nextInt(random, 4, 8); + final HashMap map = new HashMap(); + BytesRef spare = new BytesRef(); + + // create a map up front. + // then with multiple threads, generate sort keys for all the keys in the map + // and ensure they are the same as the ones we produced in serial fashion. + + for (int i = 0; i < numTestPoints; i++) { + String term = randomString(); + TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term)); + TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class); + ts.reset(); + assertTrue(ts.incrementToken()); + bytes.toBytesRef(spare); + // ensure we make a copy of the actual bytes too + map.put(term, new BytesRef(spare)); + } + + Thread threads[] = new Thread[numThreads]; + for (int i = 0; i < numThreads; i++) { + threads[i] = new Thread() { + @Override + public void run() { + try { + BytesRef spare = new BytesRef(); + for (Map.Entry mapping : map.entrySet()) { + String term = mapping.getKey(); + BytesRef expected = mapping.getValue(); + TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term)); + TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class); + ts.reset(); + assertTrue(ts.incrementToken()); + bytes.toBytesRef(spare); + assertEquals(expected, spare); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + for (int i = 0; i < numThreads; i++) { + threads[i].start(); + } + for (int i = 0; i < numThreads; i++) { + threads[i].join(); + } + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java index e16badf168c..ba348d9fe9f 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java @@ -83,4 +83,14 @@ public class TestCollationKeyAnalyzer extends CollationTestBase { (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF"); } + + public void testThreadSafe() throws Exception { + int iters = 20 * RANDOM_MULTIPLIER; + for (int i = 0; i < iters; i++) { + Locale locale = randomLocale(random); + Collator collator = Collator.getInstance(locale); + collator.setStrength(Collator.PRIMARY); + assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator)); + } + } } diff --git a/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java b/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java index a899ee57a99..c69aebecadd 100644 --- a/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java +++ b/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java @@ -84,7 +84,12 @@ public final class ICUCollationKeyFilter extends TokenFilter { */ public ICUCollationKeyFilter(TokenStream input, Collator collator) { super(input); - this.collator = collator; + // clone the collator: see http://userguide.icu-project.org/collation/architecture + try { + this.collator = (Collator) collator.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } } @Override diff --git a/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java b/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java index 5b97df62309..19962623f2b 100644 --- a/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java +++ b/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java @@ -36,7 +36,12 @@ public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl { * @param collator Collation key generator */ public ICUCollatedTermAttributeImpl(Collator collator) { - this.collator = collator; + // clone the collator: see http://userguide.icu-project.org/collation/architecture + try { + this.collator = (Collator) collator.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(e); + } } @Override diff --git a/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java b/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java index 33b21472c42..ad481ee74cb 100644 --- a/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java +++ b/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java @@ -84,4 +84,14 @@ public class TestICUCollationKeyAnalyzer extends CollationTestBase { (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, "BFJHD", "ECAGI", "BJDFH", "BJDHF"); } + + public void testThreadSafe() throws Exception { + int iters = 20 * RANDOM_MULTIPLIER; + for (int i = 0; i < iters; i++) { + Locale locale = randomLocale(random); + Collator collator = Collator.getInstance(locale); + collator.setStrength(Collator.IDENTICAL); + assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator)); + } + } }