mirror of https://github.com/apache/lucene.git
LUCENE-5447: StandardTokenizer should break at consecutive chars matching Word_Break = MidLetter, MidNum and/or MidNumLet
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1569586 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
09885620da
commit
a904e73bf4
|
@ -251,6 +251,9 @@ Bug fixes
|
||||||
offsets were corrupted if multiple fields with the same name were
|
offsets were corrupted if multiple fields with the same name were
|
||||||
added to the memory index. (Britta Weber, Simon Willnauer)
|
added to the memory index. (Britta Weber, Simon Willnauer)
|
||||||
|
|
||||||
|
* LUCENE-5447: StandardTokenizer should break at consecutive chars matching
|
||||||
|
Word_Break = MidLetter, MidNum and/or MidNumLet (Steve Rowe)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5339: The facet module was simplified/reworked to make the
|
* LUCENE-5339: The facet module was simplified/reworked to make the
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -159,8 +159,8 @@ RegionalIndicatorEx = {RegionalIndicator}
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -270,14 +270,14 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
||||||
//
|
//
|
||||||
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
|
||||||
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
|
||||||
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
|
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
|
||||||
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
|
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
|
||||||
)+
|
)+
|
||||||
)
|
)
|
||||||
)*
|
)*
|
||||||
|
|
|
@ -231,6 +231,34 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
checkOneTerm(a, "아゙", "아゙"); // hangul
|
checkOneTerm(a, "아゙", "아゙"); // hangul
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Multiple consecutive chars in \p{WB:MidLetter}, \p{WB:MidNumLet},
|
||||||
|
* and/or \p{WB:MidNum} should trigger a token split.
|
||||||
|
*/
|
||||||
|
public void testMid() throws Exception {
|
||||||
|
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless it has Letter chars on both sides or Numeric chars on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
|
||||||
|
|
||||||
|
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
|
|
|
@ -453,6 +453,36 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
||||||
checkOneTerm(a, "아゙", "아゙"); // hangul
|
checkOneTerm(a, "아゙", "아゙"); // hangul
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Multiple consecutive chars in \p{Word_Break = MidLetter},
|
||||||
|
* \p{Word_Break = MidNumLet}, and/or \p{Word_Break = MidNum}
|
||||||
|
* should trigger a token split.
|
||||||
|
*/
|
||||||
|
public void testMid() throws Exception {
|
||||||
|
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] { "A:B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless it has Letter chars on both sides or Numeric chars on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] { "1.2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] { "A.B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] { "1", "2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both sides
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] { "1,2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] { "1", "2" });
|
||||||
|
|
||||||
|
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] { "A", "B" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] { "A", "B" });
|
||||||
|
|
||||||
|
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] { "1", "2" });
|
||||||
|
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] { "1", "2" });
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||||
|
|
Loading…
Reference in New Issue