[OLINGO-568] More tests and fixes for Tokenizer

This commit is contained in:
mibo 2015-11-18 20:14:50 +01:00
parent 326e1775a7
commit ca7059c778
3 changed files with 122 additions and 52 deletions

View File

@ -65,14 +65,23 @@ public class SearchTokenizer {
} }
public State forbidden(char c) throws SearchTokenizerException { public State forbidden(char c) throws SearchTokenizerException {
throw new SearchTokenizerException("Forbidden character for " + this.getClass().getName() + "->" + c, throw new SearchTokenizerException("Forbidden character in state " + this.getToken() + "->" + c,
SearchTokenizerException.MessageKeys.FORBIDDEN_CHARACTER, "" + c); SearchTokenizerException.MessageKeys.FORBIDDEN_CHARACTER, "" + c);
} }
/**
 * Signals that the token collected by this state cannot be accepted.
 *
 * <p>This method never returns normally; the {@code State} return type only
 * allows callers to write {@code return invalid();} inside state transitions.
 *
 * @return never returns
 * @throws SearchTokenizerException always, with message key
 *         {@code INVALID_TOKEN_STATE} and a message naming the current token
 */
public State invalid() throws SearchTokenizerException {
  final String message = "Token " + this.getToken() + " is in invalid state ";
  throw new SearchTokenizerException(message,
      SearchTokenizerException.MessageKeys.INVALID_TOKEN_STATE);
}
public State finish() { public State finish() {
this.finished = true; this.finished = true;
return this; return this;
} }
/**
 * Marks this state as finished and replaces its token type in one step.
 *
 * @param token the token type this state should report from now on
 * @return the result of {@code changeToken(token)}
 */
public State finishAs(Token token) {
// Set the flag directly instead of calling finish(): subclasses override
// finish() and delegate back to this method.
this.finished = true;
return changeToken(token);
}
public boolean isFinished() { public boolean isFinished() {
return finished; return finished;
@ -86,6 +95,11 @@ public class SearchTokenizer {
return this; return this;
} }
/**
 * Replaces the token type held by this state.
 *
 * @param token the new token type
 * @return this state, so the call can be used directly in a return statement
 */
protected State changeToken(Token token) {
this.token = token;
return this;
}
static boolean isAllowedWord(final char character) { static boolean isAllowedWord(final char character) {
// TODO mibo: add missing allowed characters // TODO mibo: add missing allowed characters
int type = Character.getType(character); int type = Character.getType(character);
@ -240,7 +254,7 @@ public class SearchTokenizer {
@Override @Override
public String toString() { public String toString() {
return this.getToken().toString() + "=>{" + getLiteral() + "}"; return this.getToken() + "=>{" + getLiteral() + "}";
} }
} }
@ -360,6 +374,21 @@ public class SearchTokenizer {
return forbidden(c); return forbidden(c);
} }
/**
 * Finishes this state, re-tagging it as an operator when the collected
 * literal is exactly {@code AND}, {@code NOT} or {@code OR}.
 *
 * <p>The comparison is case-sensitive and must match the whole literal, so
 * {@code "and"} or {@code "ANDsomething"} is finished unchanged via
 * {@code super.finish()}.
 *
 * @return this state after it has been marked as finished
 */
@Override
public State finish() {
  final String word = literal.toString();
  // Equality with the operator name already implies the matching length,
  // so one pass over the three operator tokens covers every case.
  for (Token operator : new Token[] {Token.AND, Token.NOT, Token.OR}) {
    if (operator.name().equals(word)) {
      return finishAs(operator);
    }
  }
  return super.finish();
}
@Override @Override
public State close() { public State close() {
return finish(); return finish();
@ -367,6 +396,7 @@ public class SearchTokenizer {
} }
private class SearchPhraseState extends LiteralState { private class SearchPhraseState extends LiteralState {
private boolean closed = false;
public SearchPhraseState(char c) throws SearchTokenizerException { public SearchPhraseState(char c) throws SearchTokenizerException {
super(Token.PHRASE, c); super(Token.PHRASE, c);
if (c != QUOTATION_MARK) { if (c != QUOTATION_MARK) {
@ -376,19 +406,34 @@ public class SearchTokenizer {
@Override @Override
public State nextChar(char c) throws SearchTokenizerException { public State nextChar(char c) throws SearchTokenizerException {
if (isAllowedPhrase(c)) { if(closed) {
finish();
if (c == CHAR_CLOSE) {
return new CloseState();
} else if (isWhitespace(c)) {
return new RwsState();
}
} else if (isAllowedPhrase(c)) {
return allowed(c); return allowed(c);
} else if (isWhitespace(c)) { } else if (isWhitespace(c)) {
return allowed(c); return allowed(c);
} else if (c == QUOTATION_MARK) { } else if (c == QUOTATION_MARK) {
finish(); if(literal.length() == 1) {
allowed(c); return invalid();
return new SearchExpressionState(); }
} else if (isFinished()) { closed = true;
return new SearchExpressionState().init(c); return allowed(c);
} }
return forbidden(c); return forbidden(c);
} }
/**
 * Closes this phrase state at the end of the input.
 *
 * <p>If the closing quotation mark has already been read, the phrase is
 * marked as finished; otherwise the decision is left to {@code super.close()}.
 *
 * @return the state produced by {@code finish()} or {@code super.close()}
 */
@Override
public State close() {
  return closed ? finish() : super.close();
}
} }
private class OpenState extends State { private class OpenState extends State {
@ -564,6 +609,9 @@ public class SearchTokenizer {
if (state.close().isFinished()) { if (state.close().isFinished()) {
states.add(state); states.add(state);
} else {
throw new SearchTokenizerException("Last parsed state '" + state.toString() + "' is not finished.",
SearchTokenizerException.MessageKeys.NOT_FINISHED_QUERY);
} }
return states; return states;

View File

@ -24,9 +24,16 @@ public class SearchTokenizerException extends UriParserSyntaxException {
private static final long serialVersionUID = -8295456415309640166L; private static final long serialVersionUID = -8295456415309640166L;
public static enum MessageKeys implements MessageKey { public enum MessageKeys implements MessageKey {
/** parameter: character */ /** parameter: character */
FORBIDDEN_CHARACTER, FORBIDDEN_CHARACTER,
/** parameter: TOKEN */
NOT_EXPECTED_TOKEN,
/** parameter: - */
NOT_FINISHED_QUERY,
/** parameter: - */
INVALID_TOKEN_STATE,
/** parameter: - */
ALREADY_FINISHED; ALREADY_FINISHED;
@Override @Override

View File

@ -91,7 +91,7 @@ public class SearchTokenizerTest {
SearchTokenizer tokenizer = new SearchTokenizer(); SearchTokenizer tokenizer = new SearchTokenizer();
List<SearchQueryToken> result; List<SearchQueryToken> result;
SearchValidator.init("abc AND \"x-y_z\" AND olingo").validate(); TokenizerValidator.init("abc AND \"x-y_z\" AND olingo").validate();
// //
result = tokenizer.tokenize("\"abc\""); result = tokenizer.tokenize("\"abc\"");
@ -113,7 +113,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(PHRASE, result.get(0).getToken()); Assert.assertEquals(PHRASE, result.get(0).getToken());
Assert.assertEquals("\"99_88.\"", result.get(0).getLiteral()); Assert.assertEquals("\"99_88.\"", result.get(0).getLiteral());
SearchValidator.init("abc or \"xyz\"").addExpected(WORD, WORD, PHRASE).validate(); TokenizerValidator.init("abc or \"xyz\"").validate(WORD, WORD, PHRASE);
} }
/** /**
@ -124,22 +124,22 @@ public class SearchTokenizerTest {
@Ignore("Test must be moved to SearchParserTest and SearchParserAndTokenizerTest") @Ignore("Test must be moved to SearchParserTest and SearchParserAndTokenizerTest")
public void parsePhraseAbnfTestcases() throws Exception { public void parsePhraseAbnfTestcases() throws Exception {
// <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions"> // <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions">
SearchValidator.init("\"blue%20green\"").validate(); TokenizerValidator.init("\"blue%20green\"").validate();
// <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions"> // <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions">
SearchValidator.init("\"blue%20green%22").validate(); TokenizerValidator.init("\"blue%20green%22").validate();
// <TestCase Name="5.1.7 Search - phrase with escaped double-quote" Rule="queryOptions"> // <TestCase Name="5.1.7 Search - phrase with escaped double-quote" Rule="queryOptions">
// <Input>$search="blue\"green"</Input> // <Input>$search="blue\"green"</Input>
SearchValidator.init("\"blue\\\"green\"").validate(); TokenizerValidator.init("\"blue\\\"green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with escaped backslash" Rule="queryOptions"> // <TestCase Name="5.1.7 Search - phrase with escaped backslash" Rule="queryOptions">
// <Input>$search="blue\\green"</Input> // <Input>$search="blue\\green"</Input>
SearchValidator.init("\"blue\\\\green\"").validate(); TokenizerValidator.init("\"blue\\\\green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="14"> // <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="14">
SearchValidator.init("\"blue\"green\"").validate(); TokenizerValidator.init("\"blue\"green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="16"> // <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="16">
SearchValidator.init("\"blue%22green\"").validate(); TokenizerValidator.init("\"blue%22green\"").validate();
// <TestCase Name="5.1.7 Search - implicit AND" Rule="queryOptions"> // <TestCase Name="5.1.7 Search - implicit AND" Rule="queryOptions">
// <Input>$search=blue green</Input> // <Input>$search=blue green</Input>
@ -160,10 +160,10 @@ public class SearchTokenizerTest {
Assert.assertEquals(NOT, result.get(0).getToken()); Assert.assertEquals(NOT, result.get(0).getToken());
Assert.assertEquals(WORD, result.get(1).getToken()); Assert.assertEquals(WORD, result.get(1).getToken());
SearchValidator.init("not abc").addExpected(WORD, WORD).validate(); TokenizerValidator.init("not abc").addExpected(WORD, WORD).validate();
SearchValidator.init("NOT abc").addExpected(NOT, WORD).validate(); TokenizerValidator.init("NOT abc").addExpected(NOT, WORD).validate();
SearchValidator.init("NOT \"abc\"").addExpected(NOT, PHRASE).validate(); TokenizerValidator.init("NOT \"abc\"").addExpected(NOT, PHRASE).validate();
SearchValidator.init("NOT (sdf)").validate(SearchTokenizerException.class); TokenizerValidator.init("NOT (sdf)").validate(SearchTokenizerException.class);
} }
@Test @Test
@ -187,16 +187,16 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, result.get(3).getToken()); Assert.assertEquals(OR, result.get(3).getToken());
Assert.assertEquals(WORD, result.get(4).getToken()); Assert.assertEquals(WORD, result.get(4).getToken());
SearchValidator.init("abc or xyz").addExpected(WORD, WORD, WORD).validate(); TokenizerValidator.init("abc or xyz").addExpected(WORD, WORD, WORD).validate();
} }
@Test @Test
public void parseImplicitAnd() throws SearchTokenizerException { public void parseImplicitAnd() throws SearchTokenizerException {
SearchValidator.init("a b").addExpected(WORD, WORD).validate(); TokenizerValidator.init("a b").addExpected(WORD, WORD).validate();
SearchValidator.init("a b OR c").addExpected(WORD, WORD, OR, WORD).validate(); TokenizerValidator.init("a b OR c").addExpected(WORD, WORD, OR, WORD).validate();
SearchValidator.init("a bc OR c").addExpected(WORD, WORD, OR, WORD).validate(); TokenizerValidator.init("a bc OR c").addExpected(WORD, WORD, OR, WORD).validate();
SearchValidator.init("a bc c").addExpected(WORD, WORD, WORD).validate(); TokenizerValidator.init("a bc c").addExpected(WORD, WORD, WORD).validate();
SearchValidator.init("(a OR x) bc c").addExpected(OPEN, WORD, OR, WORD, CLOSE, WORD, WORD).validate(); TokenizerValidator.init("(a OR x) bc c").addExpected(OPEN, WORD, OR, WORD, CLOSE, WORD, WORD).validate();
} }
@Test @Test
@ -261,7 +261,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, result.get(3).getToken()); Assert.assertEquals(OR, result.get(3).getToken());
Assert.assertEquals(WORD, result.get(4).getToken()); Assert.assertEquals(WORD, result.get(4).getToken());
SearchValidator.init("abc AND ANDsomething") TokenizerValidator.init("abc AND ANDsomething")
.addExpected(WORD, AND, WORD).validate(); .addExpected(WORD, AND, WORD).validate();
} }
@ -282,7 +282,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, it.next().getToken()); Assert.assertEquals(OR, it.next().getToken());
Assert.assertEquals(WORD, it.next().getToken()); Assert.assertEquals(WORD, it.next().getToken());
SearchValidator.init("foo AND bar OR foo AND baz OR that AND bar OR that AND baz") TokenizerValidator.init("foo AND bar OR foo AND baz OR that AND bar OR that AND baz")
.addExpected(WORD, "foo").addExpected(AND) .addExpected(WORD, "foo").addExpected(AND)
.addExpected(WORD, "bar").addExpected(OR) .addExpected(WORD, "bar").addExpected(OR)
.addExpected(WORD, "foo").addExpected(AND) .addExpected(WORD, "foo").addExpected(AND)
@ -294,7 +294,7 @@ public class SearchTokenizerTest {
.validate(); .validate();
SearchValidator.init("(foo OR that) AND (bar OR baz)") TokenizerValidator.init("(foo OR that) AND (bar OR baz)")
.addExpected(OPEN) .addExpected(OPEN)
.addExpected(WORD, "foo").addExpected(OR).addExpected(WORD, "that") .addExpected(WORD, "foo").addExpected(OR).addExpected(WORD, "that")
.addExpected(CLOSE).addExpected(AND).addExpected(OPEN) .addExpected(CLOSE).addExpected(AND).addExpected(OPEN)
@ -325,19 +325,19 @@ public class SearchTokenizerTest {
Assert.assertEquals(AND, it.next().getToken()); Assert.assertEquals(AND, it.next().getToken());
Assert.assertEquals(WORD, it.next().getToken()); Assert.assertEquals(WORD, it.next().getToken());
SearchValidator.init("abc AND ANDsomething") TokenizerValidator.init("abc AND ANDsomething")
.addExpected(WORD, AND, WORD).validate(); .addExpected(WORD, AND, WORD).validate();
SearchValidator.init("abc ANDsomething") TokenizerValidator.init("abc ANDsomething")
.addExpected(WORD, WORD).validate(); .addExpected(WORD, WORD).validate();
SearchValidator.init("abc ORsomething") TokenizerValidator.init("abc ORsomething")
.addExpected(WORD, WORD).validate(); .addExpected(WORD, WORD).validate();
SearchValidator.init("abc OR orsomething") TokenizerValidator.init("abc OR orsomething")
.addExpected(WORD, OR, WORD).validate(); .addExpected(WORD, OR, WORD).validate();
SearchValidator.init("abc OR ORsomething") TokenizerValidator.init("abc OR ORsomething")
.addExpected(WORD, OR, WORD).validate(); .addExpected(WORD, OR, WORD).validate();
} }
@ -345,7 +345,7 @@ public class SearchTokenizerTest {
@Test @Test
public void unicodeInWords() throws Exception { public void unicodeInWords() throws Exception {
// Ll, Lm, Lo, Lt, Lu, Nl // Ll, Lm, Lo, Lt, Lu, Nl
SearchValidator.init("abc OR Ll\u01E3Lm\u02B5Lo\u1BE4Lt\u01F2Lu\u03D3Nl\u216F") TokenizerValidator.init("abc OR Ll\u01E3Lm\u02B5Lo\u1BE4Lt\u01F2Lu\u03D3Nl\u216F")
.addExpected(WORD, OR, WORD).validate(); .addExpected(WORD, OR, WORD).validate();
} }
@ -369,7 +369,7 @@ public class SearchTokenizerTest {
*/ */
@Test @Test
public void characterInPhrase() throws Exception { public void characterInPhrase() throws Exception {
SearchValidator.init("\"123\" OR \"ALPHA-._~\"") TokenizerValidator.init("\"123\" OR \"ALPHA-._~\"")
.addExpected(PHRASE, OR, PHRASE).validate(); .addExpected(PHRASE, OR, PHRASE).validate();
} }
@ -395,7 +395,7 @@ public class SearchTokenizerTest {
validate("abc def ghi"); validate("abc def ghi");
// mixed not // mixed not
SearchValidator.init(" abc def AND ghi").validate(WORD, WORD, AND, WORD); TokenizerValidator.init(" abc def AND ghi").validate(WORD, WORD, AND, WORD);
validate("NOT abc NOT def OR NOT ghi", NOT, WORD, NOT, WORD, OR, NOT, WORD); validate("NOT abc NOT def OR NOT ghi", NOT, WORD, NOT, WORD, OR, NOT, WORD);
validate(" abc def NOT ghi", WORD, WORD, NOT, WORD); validate(" abc def NOT ghi", WORD, WORD, NOT, WORD);
@ -409,26 +409,41 @@ public class SearchTokenizerTest {
} }
@Test @Test
public void parseInvalid() throws SearchTokenizerException { public void tokenizeInvalid() throws SearchTokenizerException {
SearchValidator.init("abc AND OR something").validate();
SearchValidator.init("abc AND \"something\" )").validate();
// //
SearchValidator.init("( abc AND) OR something").validate(SearchTokenizerException.class); TokenizerValidator.init("( abc AND) OR something").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"phrase\"word").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"p\"w").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"\"").validate(SearchTokenizerException.class);
}
@Test
public void tokenizeInvalidQueryForParser() throws SearchTokenizerException {
// TokenizerValidator.init("NOT").validate(NOT);
TokenizerValidator.init("AND").validate(AND);
TokenizerValidator.init("OR").validate(OR);
TokenizerValidator.init("NOT AND").validate(NOT, AND);
TokenizerValidator.init("NOT OR").validate(NOT, OR);
TokenizerValidator.init("NOT NOT").validate(NOT, NOT);
TokenizerValidator.init("abc AND OR something").validate(WORD, AND, OR, WORD);
TokenizerValidator.init("abc AND \"something\" )").validate(WORD, AND, PHRASE, CLOSE);
} }
public void validate(String query) throws SearchTokenizerException { public void validate(String query) throws SearchTokenizerException {
new SearchValidator(query).validate(); new TokenizerValidator(query).validate();
} }
public void validate(String query, SearchQueryToken.Token ... tokens) throws SearchTokenizerException { public void validate(String query, SearchQueryToken.Token ... tokens) throws SearchTokenizerException {
SearchValidator sv = new SearchValidator(query); TokenizerValidator sv = new TokenizerValidator(query);
for (SearchQueryToken.Token token : tokens) { for (SearchQueryToken.Token token : tokens) {
sv.addExpected(token); sv.addExpected(token);
} }
sv.validate(); sv.validate();
} }
private static class SearchValidator { private static class TokenizerValidator {
private List<Tuple> validations = new ArrayList<Tuple>(); private List<Tuple> validations = new ArrayList<Tuple>();
private boolean log; private boolean log;
private final String searchQuery; private final String searchQuery;
@ -450,24 +465,24 @@ public class SearchTokenizerTest {
} }
} }
private SearchValidator(String searchQuery) { private TokenizerValidator(String searchQuery) {
this.searchQuery = searchQuery; this.searchQuery = searchQuery;
} }
private static SearchValidator init(String searchQuery) { private static TokenizerValidator init(String searchQuery) {
return new SearchValidator(searchQuery); return new TokenizerValidator(searchQuery);
} }
@SuppressWarnings("unused") @SuppressWarnings("unused")
private SearchValidator enableLogging() { private TokenizerValidator enableLogging() {
log = true; log = true;
return this; return this;
} }
private SearchValidator addExpected(SearchQueryToken.Token token, String literal) { private TokenizerValidator addExpected(SearchQueryToken.Token token, String literal) {
validations.add(new Tuple(token, literal)); validations.add(new Tuple(token, literal));
return this; return this;
} }
private SearchValidator addExpected(SearchQueryToken.Token ... token) { private TokenizerValidator addExpected(SearchQueryToken.Token ... token) {
for (SearchQueryToken.Token t : token) { for (SearchQueryToken.Token t : token) {
validations.add(new Tuple(t)); validations.add(new Tuple(t));
} }