LUCENE-5447: StandardTokenizer should break at consecutive chars matching Word_Break = MidLetter, MidNum and/or MidNumLet

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1569586 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-02-19 01:21:24 +00:00
parent 09885620da
commit a904e73bf4
7 changed files with 8574 additions and 8308 deletions

View File

@ -250,6 +250,9 @@ Bug fixes
* LUCENE-5444: MemoryIndex didn't respect the analyzer's offset gap and * LUCENE-5444: MemoryIndex didn't respect the analyzer's offset gap and
offsets were corrupted if multiple fields with the same name were offsets were corrupted if multiple fields with the same name were
added to the memory index. (Britta Weber, Simon Willnauer) added to the memory index. (Britta Weber, Simon Willnauer)
* LUCENE-5447: StandardTokenizer should break at consecutive chars matching
Word_Break = MidLetter, MidNum and/or MidNumLet (Steve Rowe)
API Changes API Changes

View File

@ -157,10 +157,10 @@ RegionalIndicatorEx = {RegionalIndicator}
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet // WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) // WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// //
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* {ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} ) | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )* | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )* | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+ )+
) )
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* ({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*

View File

@ -268,16 +268,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet // WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) // WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
// //
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* {ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} ) | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )* | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )* | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+ )+
) )
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )* ({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} ) | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )* | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )* | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+ )+
) )
)* )*

View File

@ -230,8 +230,36 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "壹゙", "壹゙"); // ideographic checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul checkOneTerm(a, "아゙", "아゙"); // hangul
} }
/**
 * Verifies that a run of two or more adjacent characters taken from
 * \p{WB:MidLetter}, \p{WB:MidNumLet} and/or \p{WB:MidNum} forces a token
 * split, while a single such character flanked by compatible characters
 * does not.
 */
public void testMid() throws Exception {
  // Each row is { input, expectedToken1, expectedToken2, ... }; rows are checked in order.
  final String[][] scenarios = {
    // ':' is in \p{WB:MidLetter}: it splits unless a Letter char sits on both sides
    { "A:B", "A:B" },
    { "A::B", "A", "B" },
    // '.' is in \p{WB:MidNumLet}: it splits unless a Letter or Numeric char sits on both sides
    { "1.2", "1.2" },
    { "A.B", "A.B" },
    { "1..2", "1", "2" },
    { "A..B", "A", "B" },
    // ',' is in \p{WB:MidNum}: it splits unless a Numeric char sits on both sides
    { "1,2", "1,2" },
    { "1,,2", "1", "2" },
    // Adjacent \p{WB:MidLetter} and \p{WB:MidNumLet} chars, in either order, split
    { "A.:B", "A", "B" },
    { "A:.B", "A", "B" },
    // Adjacent \p{WB:MidNum} and \p{WB:MidNumLet} chars, in either order, split
    { "1,.2", "1", "2" },
    { "1.,2", "1", "2" },
  };
  for (String[] scenario : scenarios) {
    // Everything after the first element is the expected token sequence.
    final String[] expectedTokens = new String[scenario.length - 1];
    System.arraycopy(scenario, 1, expectedTokens, 0, expectedTokens.length);
    BaseTokenStreamTestCase.assertAnalyzesTo(a, scenario[0], expectedTokens);
  }
}
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {

View File

@ -453,6 +453,36 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
checkOneTerm(a, "아゙", "아゙"); // hangul checkOneTerm(a, "아゙", "아゙"); // hangul
} }
/**
 * Verifies that a run of two or more adjacent characters taken from
 * \p{Word_Break = MidLetter}, \p{Word_Break = MidNumLet} and/or
 * \p{Word_Break = MidNum} forces a token split, while a single such
 * character flanked by compatible characters does not.
 */
public void testMid() throws Exception {
  // Each row is { input, expectedToken1, expectedToken2, ... }; rows are checked in order.
  final String[][] scenarios = {
    // ':' is in \p{WB:MidLetter}: it splits unless a Letter char sits on both sides
    { "A:B", "A:B" },
    { "A::B", "A", "B" },
    // '.' is in \p{WB:MidNumLet}: it splits unless a Letter or Numeric char sits on both sides
    { "1.2", "1.2" },
    { "A.B", "A.B" },
    { "1..2", "1", "2" },
    { "A..B", "A", "B" },
    // ',' is in \p{WB:MidNum}: it splits unless a Numeric char sits on both sides
    { "1,2", "1,2" },
    { "1,,2", "1", "2" },
    // Adjacent \p{WB:MidLetter} and \p{WB:MidNumLet} chars, in either order, split
    { "A.:B", "A", "B" },
    { "A:.B", "A", "B" },
    // Adjacent \p{WB:MidNum} and \p{WB:MidNumLet} chars, in either order, split
    { "1,.2", "1", "2" },
    { "1.,2", "1", "2" },
  };
  for (String[] scenario : scenarios) {
    // Everything after the first element is the expected token sequence.
    final String[] expectedTokens = new String[scenario.length - 1];
    System.arraycopy(scenario, 1, expectedTokens, 0, expectedTokens.length);
    BaseTokenStreamTestCase.assertAnalyzesTo(a, scenario[0], expectedTokens);
  }
}
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);