mirror of https://github.com/apache/lucene.git
LUCENE-5447: StandardTokenizer should break at consecutive chars matching Word_Break = MidLetter, MidNum and/or MidNumLet
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1569586 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
09885620da
commit
a904e73bf4
|
@ -250,6 +250,9 @@ Bug fixes
|
|||
* LUCENE-5444: MemoryIndex didn't respect the analyzer's offset gap and
|
||||
offsets were corrupted if multiple fields with the same name were
|
||||
added to the memory index. (Britta Weber, Simon Willnauer)
|
||||
|
||||
* LUCENE-5447: StandardTokenizer should break at consecutive chars matching
|
||||
Word_Break = MidLetter, MidNum and/or MidNumLet (Steve Rowe)
|
||||
|
||||
API Changes
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -157,10 +157,10 @@ RegionalIndicatorEx = {RegionalIndicator}
|
|||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||
)+
|
||||
)
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -268,16 +268,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
|||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||
)+
|
||||
)
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||
)+
|
||||
)
|
||||
)*
|
||||
|
|
|
@ -230,8 +230,36 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
|||
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
|
||||
checkOneTerm(a, "아゙", "아゙"); // hangul
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
|
||||
* and/or \p{WB:MidNum} should trigger a token split, whereas a single such
* char between two compatible chars (e.g. "A:B", "1.2", "1,2") stays inside the token.
|
||||
*/
|
||||
public void testMid() throws Exception {
|
||||
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
|
||||
|
||||
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
|
||||
|
||||
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
|
||||
|
||||
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
|
||||
|
||||
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
|
|
|
@ -453,6 +453,36 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
checkOneTerm(a, "아゙", "아゙"); // hangul
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiple consecutive chars in \p{Word_Break = MidLetter},
|
||||
* \p{Word_Break = MidNumLet}, and/or \p{Word_Break = MidNum}
|
||||
* should trigger a token split, whereas a single such char between two
* compatible chars (e.g. "A:B", "1.2", "1,2") stays inside the token.
* The comments below abbreviate \p{Word_Break = X} as \p{WB:X}.
|
||||
*/
|
||||
public void testMid() throws Exception {
|
||||
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
|
||||
|
||||
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
|
||||
|
||||
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
|
||||
|
||||
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
|
||||
|
||||
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
|
||||
}
|
||||
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
|
|
Loading…
Reference in New Issue