[OLINGO-568] More tests and fixes for Tokenizer

This commit is contained in:
mibo 2015-11-18 20:14:50 +01:00
parent 326e1775a7
commit ca7059c778
3 changed files with 122 additions and 52 deletions

View File

@ -65,14 +65,23 @@ public class SearchTokenizer {
}
public State forbidden(char c) throws SearchTokenizerException {
throw new SearchTokenizerException("Forbidden character for " + this.getClass().getName() + "->" + c,
throw new SearchTokenizerException("Forbidden character in state " + this.getToken() + "->" + c,
SearchTokenizerException.MessageKeys.FORBIDDEN_CHARACTER, "" + c);
}
// Aborts tokenizing because the current token is in an invalid state
// (e.g. a malformed literal). Declared to return State for call-site
// symmetry with allowed()/forbidden(), but always throws.
public State invalid() throws SearchTokenizerException {
throw new SearchTokenizerException("Token " + this.getToken() + " is in invalid state ",
SearchTokenizerException.MessageKeys.INVALID_TOKEN_STATE);
}
// Marks this state as successfully finished and returns itself for chaining.
public State finish() {
this.finished = true;
return this;
}
// Marks this state as finished and simultaneously re-types it as the given
// token — used when the final literal determines the token kind (e.g. a
// word literal that turns out to be the keyword AND, OR or NOT).
public State finishAs(Token token) {
this.finished = true;
return changeToken(token);
}
public boolean isFinished() {
return finished;
@ -86,6 +95,11 @@ public class SearchTokenizer {
return this;
}
// Replaces this state's token type and returns itself for chaining.
protected State changeToken(Token token) {
this.token = token;
return this;
}
static boolean isAllowedWord(final char character) {
// TODO mibo: add missing allowed characters
int type = Character.getType(character);
@ -240,7 +254,7 @@ public class SearchTokenizer {
@Override
public String toString() {
return this.getToken().toString() + "=>{" + getLiteral() + "}";
return this.getToken() + "=>{" + getLiteral() + "}";
}
}
@ -360,6 +374,21 @@ public class SearchTokenizer {
return forbidden(c);
}
@Override
public State finish() {
// On finish, a word literal that exactly (case-sensitively) matches a
// keyword name is re-typed: "AND"/"NOT" have length 3, "OR" length 2.
// The length check is a cheap pre-filter before the string comparison.
String tmpLiteral = literal.toString();
if(tmpLiteral.length() == 3) {
if(Token.AND.name().equals(tmpLiteral)) {
return finishAs(Token.AND);
} else if(Token.NOT.name().equals(tmpLiteral)) {
return finishAs(Token.NOT);
}
} else if(tmpLiteral.length() == 2 && Token.OR.name().equals(tmpLiteral)) {
return finishAs(Token.OR);
}
// Anything else remains a plain word token.
return super.finish();
}
@Override
public State close() {
return finish();
@ -367,6 +396,7 @@ public class SearchTokenizer {
}
private class SearchPhraseState extends LiteralState {
private boolean closed = false;
public SearchPhraseState(char c) throws SearchTokenizerException {
super(Token.PHRASE, c);
if (c != QUOTATION_MARK) {
@ -376,19 +406,34 @@ public class SearchTokenizer {
@Override
public State nextChar(char c) throws SearchTokenizerException {
if (isAllowedPhrase(c)) {
if(closed) {
finish();
if (c == CHAR_CLOSE) {
return new CloseState();
} else if (isWhitespace(c)) {
return new RwsState();
}
} else if (isAllowedPhrase(c)) {
return allowed(c);
} else if (isWhitespace(c)) {
return allowed(c);
} else if (c == QUOTATION_MARK) {
finish();
allowed(c);
return new SearchExpressionState();
} else if (isFinished()) {
return new SearchExpressionState().init(c);
if(literal.length() == 1) {
return invalid();
}
closed = true;
return allowed(c);
}
return forbidden(c);
}
@Override
public State close() {
// End of input: a phrase may only finish after its terminating quotation
// mark was consumed (closed == true). Otherwise defer to the base close,
// which presumably leaves the state unfinished so tokenize() can reject
// the unterminated phrase — NOTE(review): confirm in base State.close().
if(closed) {
return finish();
}
return super.close();
}
}
private class OpenState extends State {
@ -564,6 +609,9 @@ public class SearchTokenizer {
if (state.close().isFinished()) {
states.add(state);
} else {
throw new SearchTokenizerException("Last parsed state '" + state.toString() + "' is not finished.",
SearchTokenizerException.MessageKeys.NOT_FINISHED_QUERY);
}
return states;

View File

@ -24,9 +24,16 @@ public class SearchTokenizerException extends UriParserSyntaxException {
private static final long serialVersionUID = -8295456415309640166L;
public static enum MessageKeys implements MessageKey {
public enum MessageKeys implements MessageKey {
/** parameter: character */
FORBIDDEN_CHARACTER,
FORBIDDEN_CHARACTER,
/** parameter: TOKEN */
NOT_EXPECTED_TOKEN,
/** parameter: - */
NOT_FINISHED_QUERY,
/** parameter: - */
INVALID_TOKEN_STATE,
/** parameter: - */
ALREADY_FINISHED;
@Override

View File

@ -91,7 +91,7 @@ public class SearchTokenizerTest {
SearchTokenizer tokenizer = new SearchTokenizer();
List<SearchQueryToken> result;
SearchValidator.init("abc AND \"x-y_z\" AND olingo").validate();
TokenizerValidator.init("abc AND \"x-y_z\" AND olingo").validate();
//
result = tokenizer.tokenize("\"abc\"");
@ -113,7 +113,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(PHRASE, result.get(0).getToken());
Assert.assertEquals("\"99_88.\"", result.get(0).getLiteral());
SearchValidator.init("abc or \"xyz\"").addExpected(WORD, WORD, PHRASE).validate();
TokenizerValidator.init("abc or \"xyz\"").validate(WORD, WORD, PHRASE);
}
/**
@ -124,22 +124,22 @@ public class SearchTokenizerTest {
@Ignore("Test must be moved to SearchParserTest and SearchParserAndTokenizerTest")
public void parsePhraseAbnfTestcases() throws Exception {
// <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions">
SearchValidator.init("\"blue%20green\"").validate();
TokenizerValidator.init("\"blue%20green\"").validate();
// <TestCase Name="5.1.7 Search - simple phrase" Rule="queryOptions">
SearchValidator.init("\"blue%20green%22").validate();
TokenizerValidator.init("\"blue%20green%22").validate();
// <TestCase Name="5.1.7 Search - phrase with escaped double-quote" Rule="queryOptions">
// <Input>$search="blue\"green"</Input>
SearchValidator.init("\"blue\\\"green\"").validate();
TokenizerValidator.init("\"blue\\\"green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with escaped backslash" Rule="queryOptions">
// <Input>$search="blue\\green"</Input>
SearchValidator.init("\"blue\\\\green\"").validate();
TokenizerValidator.init("\"blue\\\\green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="14">
SearchValidator.init("\"blue\"green\"").validate();
TokenizerValidator.init("\"blue\"green\"").validate();
// <TestCase Name="5.1.7 Search - phrase with unescaped double-quote" Rule="queryOptions" FailAt="16">
SearchValidator.init("\"blue%22green\"").validate();
TokenizerValidator.init("\"blue%22green\"").validate();
// <TestCase Name="5.1.7 Search - implicit AND" Rule="queryOptions">
// <Input>$search=blue green</Input>
@ -160,10 +160,10 @@ public class SearchTokenizerTest {
Assert.assertEquals(NOT, result.get(0).getToken());
Assert.assertEquals(WORD, result.get(1).getToken());
SearchValidator.init("not abc").addExpected(WORD, WORD).validate();
SearchValidator.init("NOT abc").addExpected(NOT, WORD).validate();
SearchValidator.init("NOT \"abc\"").addExpected(NOT, PHRASE).validate();
SearchValidator.init("NOT (sdf)").validate(SearchTokenizerException.class);
TokenizerValidator.init("not abc").addExpected(WORD, WORD).validate();
TokenizerValidator.init("NOT abc").addExpected(NOT, WORD).validate();
TokenizerValidator.init("NOT \"abc\"").addExpected(NOT, PHRASE).validate();
TokenizerValidator.init("NOT (sdf)").validate(SearchTokenizerException.class);
}
@Test
@ -187,16 +187,16 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, result.get(3).getToken());
Assert.assertEquals(WORD, result.get(4).getToken());
SearchValidator.init("abc or xyz").addExpected(WORD, WORD, WORD).validate();
TokenizerValidator.init("abc or xyz").addExpected(WORD, WORD, WORD).validate();
}
@Test
public void parseImplicitAnd() throws SearchTokenizerException {
SearchValidator.init("a b").addExpected(WORD, WORD).validate();
SearchValidator.init("a b OR c").addExpected(WORD, WORD, OR, WORD).validate();
SearchValidator.init("a bc OR c").addExpected(WORD, WORD, OR, WORD).validate();
SearchValidator.init("a bc c").addExpected(WORD, WORD, WORD).validate();
SearchValidator.init("(a OR x) bc c").addExpected(OPEN, WORD, OR, WORD, CLOSE, WORD, WORD).validate();
TokenizerValidator.init("a b").addExpected(WORD, WORD).validate();
TokenizerValidator.init("a b OR c").addExpected(WORD, WORD, OR, WORD).validate();
TokenizerValidator.init("a bc OR c").addExpected(WORD, WORD, OR, WORD).validate();
TokenizerValidator.init("a bc c").addExpected(WORD, WORD, WORD).validate();
TokenizerValidator.init("(a OR x) bc c").addExpected(OPEN, WORD, OR, WORD, CLOSE, WORD, WORD).validate();
}
@Test
@ -261,7 +261,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, result.get(3).getToken());
Assert.assertEquals(WORD, result.get(4).getToken());
SearchValidator.init("abc AND ANDsomething")
TokenizerValidator.init("abc AND ANDsomething")
.addExpected(WORD, AND, WORD).validate();
}
@ -282,7 +282,7 @@ public class SearchTokenizerTest {
Assert.assertEquals(OR, it.next().getToken());
Assert.assertEquals(WORD, it.next().getToken());
SearchValidator.init("foo AND bar OR foo AND baz OR that AND bar OR that AND baz")
TokenizerValidator.init("foo AND bar OR foo AND baz OR that AND bar OR that AND baz")
.addExpected(WORD, "foo").addExpected(AND)
.addExpected(WORD, "bar").addExpected(OR)
.addExpected(WORD, "foo").addExpected(AND)
@ -294,7 +294,7 @@ public class SearchTokenizerTest {
.validate();
SearchValidator.init("(foo OR that) AND (bar OR baz)")
TokenizerValidator.init("(foo OR that) AND (bar OR baz)")
.addExpected(OPEN)
.addExpected(WORD, "foo").addExpected(OR).addExpected(WORD, "that")
.addExpected(CLOSE).addExpected(AND).addExpected(OPEN)
@ -325,19 +325,19 @@ public class SearchTokenizerTest {
Assert.assertEquals(AND, it.next().getToken());
Assert.assertEquals(WORD, it.next().getToken());
SearchValidator.init("abc AND ANDsomething")
TokenizerValidator.init("abc AND ANDsomething")
.addExpected(WORD, AND, WORD).validate();
SearchValidator.init("abc ANDsomething")
TokenizerValidator.init("abc ANDsomething")
.addExpected(WORD, WORD).validate();
SearchValidator.init("abc ORsomething")
TokenizerValidator.init("abc ORsomething")
.addExpected(WORD, WORD).validate();
SearchValidator.init("abc OR orsomething")
TokenizerValidator.init("abc OR orsomething")
.addExpected(WORD, OR, WORD).validate();
SearchValidator.init("abc OR ORsomething")
TokenizerValidator.init("abc OR ORsomething")
.addExpected(WORD, OR, WORD).validate();
}
@ -345,7 +345,7 @@ public class SearchTokenizerTest {
@Test
public void unicodeInWords() throws Exception {
// Ll, Lm, Lo, Lt, Lu, Nl
SearchValidator.init("abc OR Ll\u01E3Lm\u02B5Lo\u1BE4Lt\u01F2Lu\u03D3Nl\u216F")
TokenizerValidator.init("abc OR Ll\u01E3Lm\u02B5Lo\u1BE4Lt\u01F2Lu\u03D3Nl\u216F")
.addExpected(WORD, OR, WORD).validate();
}
@ -369,7 +369,7 @@ public class SearchTokenizerTest {
*/
@Test
public void characterInPhrase() throws Exception {
SearchValidator.init("\"123\" OR \"ALPHA-._~\"")
TokenizerValidator.init("\"123\" OR \"ALPHA-._~\"")
.addExpected(PHRASE, OR, PHRASE).validate();
}
@ -395,7 +395,7 @@ public class SearchTokenizerTest {
validate("abc def ghi");
// mixed not
SearchValidator.init(" abc def AND ghi").validate(WORD, WORD, AND, WORD);
TokenizerValidator.init(" abc def AND ghi").validate(WORD, WORD, AND, WORD);
validate("NOT abc NOT def OR NOT ghi", NOT, WORD, NOT, WORD, OR, NOT, WORD);
validate(" abc def NOT ghi", WORD, WORD, NOT, WORD);
@ -409,26 +409,41 @@ public class SearchTokenizerTest {
}
@Test
public void parseInvalid() throws SearchTokenizerException {
SearchValidator.init("abc AND OR something").validate();
SearchValidator.init("abc AND \"something\" )").validate();
public void tokenizeInvalid() throws SearchTokenizerException {
//
SearchValidator.init("( abc AND) OR something").validate(SearchTokenizerException.class);
TokenizerValidator.init("( abc AND) OR something").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"phrase\"word").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"p\"w").validate(SearchTokenizerException.class);
TokenizerValidator.init("\"\"").validate(SearchTokenizerException.class);
}
@Test
public void tokenizeInvalidQueryForParser() throws SearchTokenizerException {
// These queries tokenize successfully even though they are not valid
// search expressions — rejecting them is the parser's job, not the
// tokenizer's; the tokenizer only classifies the lexical tokens.
// TokenizerValidator.init("NOT").validate(NOT);
TokenizerValidator.init("AND").validate(AND);
TokenizerValidator.init("OR").validate(OR);
TokenizerValidator.init("NOT AND").validate(NOT, AND);
TokenizerValidator.init("NOT OR").validate(NOT, OR);
TokenizerValidator.init("NOT NOT").validate(NOT, NOT);
TokenizerValidator.init("abc AND OR something").validate(WORD, AND, OR, WORD);
TokenizerValidator.init("abc AND \"something\" )").validate(WORD, AND, PHRASE, CLOSE);
}
public void validate(String query) throws SearchTokenizerException {
new SearchValidator(query).validate();
new TokenizerValidator(query).validate();
}
public void validate(String query, SearchQueryToken.Token ... tokens) throws SearchTokenizerException {
SearchValidator sv = new SearchValidator(query);
TokenizerValidator sv = new TokenizerValidator(query);
for (SearchQueryToken.Token token : tokens) {
sv.addExpected(token);
}
sv.validate();
}
private static class SearchValidator {
private static class TokenizerValidator {
private List<Tuple> validations = new ArrayList<Tuple>();
private boolean log;
private final String searchQuery;
@ -450,24 +465,24 @@ public class SearchTokenizerTest {
}
}
private SearchValidator(String searchQuery) {
private TokenizerValidator(String searchQuery) {
this.searchQuery = searchQuery;
}
private static SearchValidator init(String searchQuery) {
return new SearchValidator(searchQuery);
private static TokenizerValidator init(String searchQuery) {
return new TokenizerValidator(searchQuery);
}
@SuppressWarnings("unused")
private SearchValidator enableLogging() {
private TokenizerValidator enableLogging() {
log = true;
return this;
}
private SearchValidator addExpected(SearchQueryToken.Token token, String literal) {
private TokenizerValidator addExpected(SearchQueryToken.Token token, String literal) {
validations.add(new Tuple(token, literal));
return this;
}
private SearchValidator addExpected(SearchQueryToken.Token ... token) {
private TokenizerValidator addExpected(SearchQueryToken.Token ... token) {
for (SearchQueryToken.Token t : token) {
validations.add(new Tuple(t));
}