LUCENE-5447: StandardTokenizer should break at consecutive chars matching Word_Break = MidLetter, MidNum and/or MidNumLet

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1569586 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-02-19 01:21:24 +00:00
parent 09885620da
commit a904e73bf4
7 changed files with 8574 additions and 8308 deletions

View File

@ -250,6 +250,9 @@ Bug fixes
* LUCENE-5444: MemoryIndex did't respect the analyzers offset gap and
offsets were corrupted if multiple fields with the same name were
added to the memory index. (Britta Weber, Simon Willnauer)
* LUCENE-5447: StandardTokenizer should break at consecutive chars matching
Word_Break = MidLetter, MidNum and/or MidNumLet (Steve Rowe)
API Changes

View File

@ -157,10 +157,10 @@ RegionalIndicatorEx = {RegionalIndicator}
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*

View File

@ -268,16 +268,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+
)
)*

View File

@ -230,8 +230,36 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/**
* Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
* and/or \p{MidNum} should trigger a token split.
*/
public void testMid() throws Exception {
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {

View File

@ -453,6 +453,36 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/**
* Multiple consecutive chars in \p{Word_Break = MidLetter},
* \p{Word_Break = MidNumLet}, and/or \p{Word_Break = MidNum}
* should trigger a token split.
*/
public void testMid() throws Exception {
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);