From 5ccf063a5d635e60e0b1d755bb55fcec8b00c37e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 31 Jan 2011 14:06:45 +0000 Subject: [PATCH] LUCENE-2901: fix consistency of KeywordMarkerFilter, it should only set, not unset the attribute git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1065621 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 ++-- .../miscellaneous/KeywordMarkerFilter.java | 8 +++++--- .../miscellaneous/TestKeywordMarkerFilter.java | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 79ded77817e..cf9f02efb99 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -747,8 +747,8 @@ New features stopwords, and implement many analyzers in contrib with it. (Simon Willnauer via Robert Muir) -* LUCENE-2198: Support protected words in stemming TokenFilters using a - new KeywordAttribute. (Simon Willnauer via Uwe Schindler) +* LUCENE-2198, LUCENE-2901: Support protected words in stemming TokenFilters using a + new KeywordAttribute. (Simon Willnauer, Drew Farris via Uwe Schindler) * LUCENE-2183, LUCENE-2240, LUCENE-2241: Added Unicode 4 support to CharTokenizer and its subclasses. CharTokenizer now has new diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java index b5fb812baca..7a55e32c53f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilter.java @@ -74,10 +74,12 @@ public final class KeywordMarkerFilter extends TokenFilter { @Override public final boolean incrementToken() throws IOException { if (input.incrementToken()) { - keywordAttr.setKeyword(keywordSet.contains(termAtt.buffer(), 0, - termAtt.length())); + if (keywordSet.contains(termAtt.buffer(), 0, termAtt.length())) { + keywordAttr.setKeyword(true); + } return true; - } else + } else { return false; + } } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java index f12e7c488c8..4637ee1210b 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilter.java @@ -2,6 +2,7 @@ package org.apache.lucene.analysis.miscellaneous; import java.io.IOException; import java.io.StringReader; +import java.util.Arrays; import java.util.HashSet; import java.util.Locale; import java.util.Set; @@ -57,6 +58,19 @@ public class TestKeywordMarkerFilter extends BaseTokenStreamTestCase { "The quIck browN LuceneFox Jumps")), set2)), output); } + // LUCENE-2901 + public void testComposition() throws Exception { + TokenStream ts = new LowerCaseFilterMock( + new KeywordMarkerFilter( + new KeywordMarkerFilter( + new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader("Dogs Trees Birds Houses")), + new HashSet(Arrays.asList(new String[] { "Birds", "Houses" }))), + new HashSet(Arrays.asList(new String[] { "Dogs", "Trees" })))); + + assertTokenStreamContents(ts, new String[] { "Dogs", "Trees", "Birds", "Houses" }); + } + public static final class LowerCaseFilterMock extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);