From f9b2e971f2b7723887c4e154557a669e4e243cae Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Fri, 7 Dec 2007 12:21:49 +0000 Subject: [PATCH] LUCENE-1077 new sinks and payloads analysis packages git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@602081 13f79535-47bb-0310-9956-ffa450edef68 --- .../payloads/NumericPayloadTokenFilter.java | 85 ++++++++++++++++++ .../payloads/TypeAsPayloadTokenFilter.java | 49 +++++++++++ .../lucene/analysis/payloads/package.html | 31 +++++++ .../sinks/DateRecognizerSinkTokenizer.java | 87 +++++++++++++++++++ .../sinks/TokenRangeSinkTokenizer.java | 55 ++++++++++++ .../sinks/TokenTypeSinkTokenizer.java | 54 ++++++++++++ .../apache/lucene/analysis/sinks/package.html | 30 +++++++ .../NumericPayloadTokenFilterTest.java | 79 +++++++++++++++++ .../TypeAsPayloadTokenFilterTest.java | 76 ++++++++++++++++ .../DateRecognizerSinkTokenizerTest.java | 60 +++++++++++++ .../sinks/TokenRangeSinkTokenizerTest.java | 54 ++++++++++++ .../sinks/TokenTypeSinkTokenizerTest.java | 72 +++++++++++++++ 12 files changed, 732 insertions(+) create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/package.html create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/package.html create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java new file mode 100644 index 00000000000..13bbc618254 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java @@ -0,0 +1,85 @@ +package org.apache.lucene.analysis.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.Payload; + +import java.io.IOException; + + +/** + * Assigns a payload to a token based on the {@link org.apache.lucene.analysis.Token#type()} + * + **/ +public class NumericPayloadTokenFilter extends TokenFilter { + + private String typeMatch; + private Payload thePayload; + + public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { + super(input); + //Need to encode the payload + thePayload = new Payload(encodePayload(payload)); + this.typeMatch = typeMatch; + } + + public static byte[] encodePayload(float payload) { + byte[] result = new byte[4]; + int tmp = Float.floatToIntBits(payload); + result[0] = (byte)(tmp >> 24); + result[1] = (byte)(tmp >> 16); + result[2] = (byte)(tmp >> 8); + result[3] = (byte) tmp; + + return result; + } + + /** + * @see #decodePayload(byte[], int) + * @see #encodePayload(float) + */ + public static float decodePayload(byte [] bytes){ + return decodePayload(bytes, 0); + } + + /** + * Decode the payload that was encoded using {@link #encodePayload(float)}. + * NOTE: the length of the array must be at least offset + 4 long. + * @param bytes The bytes to decode + * @param offset The offset into the array. + * @return The float that was encoded + * + * @see #encodePayload(float) + */ + public static final float decodePayload(byte [] bytes, int offset){ + int tmp = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) + | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + return Float.intBitsToFloat(tmp); + } + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null && result.type().equals(typeMatch)){ + result.setPayload(thePayload); + } + return result; + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java new file mode 100644 index 00000000000..d4dbae13332 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java @@ -0,0 +1,49 @@ +package org.apache.lucene.analysis.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.Payload; + +import java.io.IOException; + + +/** + * Makes the {@link org.apache.lucene.analysis.Token#type()} a payload. 
+ * + * Encodes the type using {@link String#getBytes(String)} with "UTF-8" as the encoding + * + **/ +public class TypeAsPayloadTokenFilter extends TokenFilter { + + public TypeAsPayloadTokenFilter(TokenStream input) { + super(input); + + } + + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null && result.type() != null && result.type().equals("") == false){ + result.setPayload(new Payload(result.type().getBytes("UTF-8"))); + } + return result; + } +} \ No newline at end of file diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/package.html new file mode 100644 index 00000000000..abababcc93f --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/package.html @@ -0,0 +1,31 @@ + + + org.apache.lucene.analysis.payloads + + +
Provides various convenience classes for creating payloads on Tokens, e.g. storing the token type or a fixed float value as the payload. +
+
+ 
+
+Copyright © 2007 Apache Software Foundation +
+ + \ No newline at end of file diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java new file mode 100644 index 00000000000..77f0f86ac29 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizer.java @@ -0,0 +1,87 @@ +package org.apache.lucene.analysis.sinks; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.SinkTokenizer; +import org.apache.lucene.analysis.Token; + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.text.ParseException; +import java.util.List; +import java.util.Date; + + +/** + * Attempts to parse the {@link org.apache.lucene.analysis.Token#termBuffer()} as a Date using a {@link java.text.DateFormat}. + * If the value is a Date, it will add it to the sink. + *
+ * Also marks the sink token with {@link org.apache.lucene.analysis.Token#type()} equal to {@link #DATE_TYPE} + * + * + **/ +public class DateRecognizerSinkTokenizer extends SinkTokenizer { + public static final String DATE_TYPE = "date"; + + protected DateFormat dateFormat; + + /** + * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object. + */ + public DateRecognizerSinkTokenizer() { + this(null, SimpleDateFormat.getDateInstance()); + } + + public DateRecognizerSinkTokenizer(DateFormat dateFormat) { + this(null, dateFormat); + } + + /** + * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object. + * @param input The input list of Tokens that are already Dates. They should be marked as type {@link #DATE_TYPE} for completeness + */ + public DateRecognizerSinkTokenizer(List/**/ input) { + this(input, SimpleDateFormat.getDateInstance()); + } + + /** + * + * @param input + * @param dateFormat The date format to use to try and parse the date. Note, this SinkTokenizer makes no attempt to synchronize the DateFormat object + */ + public DateRecognizerSinkTokenizer(List/**/ input, DateFormat dateFormat) { + super(input); + this.dateFormat = dateFormat; + } + + + public void add(Token t) { + //Check to see if this token is a date + if (t != null) { + try { + Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date + if (date != null) { + t.setType(DATE_TYPE); + lst.add(t.clone()); + } + } catch (ParseException e) { + + } + } + + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizer.java new file mode 100644 index 00000000000..533e9d1577b --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizer.java @@ -0,0 +1,55 @@ +package org.apache.lucene.analysis.sinks; + +import org.apache.lucene.analysis.SinkTokenizer; +import org.apache.lucene.analysis.Token; + +import java.io.IOException; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** + * Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper + * + **/ +public class TokenRangeSinkTokenizer extends SinkTokenizer { + private int lower; + private int upper; + private int count; + + public TokenRangeSinkTokenizer(int lower, int upper) { + this.lower = lower; + this.upper = upper; + } + + public TokenRangeSinkTokenizer(int initCap, int lower, int upper) { + super(initCap); + this.lower = lower; + this.upper = upper; + } + + public void add(Token t) { + if (count >= lower && count < upper){ + super.add(t); + } + count++; + } + + public void reset() throws IOException { + count = 0; + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java new file mode 100644 index 00000000000..f814555035c --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizer.java @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.sinks; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.SinkTokenizer; +import org.apache.lucene.analysis.Token; + +import java.util.List; + + +/** + * If the {@link org.apache.lucene.analysis.Token#type()} matches the passed in typeToMatch then + * add it to the sink + * + **/ +public class TokenTypeSinkTokenizer extends SinkTokenizer { + + private String typeToMatch; + + public TokenTypeSinkTokenizer(String typeToMatch) { + this.typeToMatch = typeToMatch; + } + + public TokenTypeSinkTokenizer(int initCap, String typeToMatch) { + super(initCap); + this.typeToMatch = typeToMatch; + } + + public TokenTypeSinkTokenizer(List/**/ input, String typeToMatch) { + super(input); + this.typeToMatch = typeToMatch; + } + + public void add(Token t) { + //check to see if this is a Category + if (t != null && typeToMatch.equals(t.type())){ + lst.add(t.clone()); + } + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/package.html new file mode 100644 index 00000000000..f60cd21d441 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/sinks/package.html @@ -0,0 +1,30 @@ + + + org.apache.lucene.analysis.sinks + + +
Implementations of SinkTokenizer that capture useful subsets of a TokenStream, such as dates, token ranges, and tokens of a given type. +
+
+ 
+
+Copyright © 2007 Apache Software Foundation +
+ + \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java new file mode 100644 index 00000000000..edfbb72abd0 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java @@ -0,0 +1,79 @@ +package org.apache.lucene.analysis.payloads; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +import java.io.IOException; +import java.io.StringReader; + +public class NumericPayloadTokenFilterTest extends TestCase { + + + public NumericPayloadTokenFilterTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() throws IOException { + String test = "The quick red fox jumped over the lazy brown dogs"; + + NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D"); + Token tok = new Token(); + boolean seenDogs = false; + while ((tok = nptf.next(tok)) != null){ + if (tok.termText().equals("dogs")){ + seenDogs = true; + assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true); + assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null); + byte [] bytes = tok.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length + assertTrue(bytes.length + " does not equal: " + tok.getPayload().length(), bytes.length == tok.getPayload().length()); + assertTrue(tok.getPayload().getOffset() + " does not equal: " + 0, tok.getPayload().getOffset() == 0); + float pay = NumericPayloadTokenFilter.decodePayload(bytes); + assertTrue(pay + " does not equal: " + 3, pay == 3); + } else { + assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word")); + } + } + assertTrue(seenDogs + " does not equal: " + true, seenDogs == true); + } + + private class WordTokenFilter extends TokenFilter { + private WordTokenFilter(TokenStream input) { + super(input); + } + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null && result.termText().equals("dogs")) { + result.setType("D"); + } + return result; + } + } + +} \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java new file mode 100644 index 00000000000..03f964d30cd --- /dev/null +++ 
b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterTest.java @@ -0,0 +1,76 @@ +package org.apache.lucene.analysis.payloads; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; + +import java.io.IOException; +import java.io.StringReader; + +public class TypeAsPayloadTokenFilterTest extends TestCase { + + + public TypeAsPayloadTokenFilterTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + + public void test() throws IOException { + String test = "The quick red fox jumped over the lazy brown dogs"; + + TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test)))); + Token tok = new Token(); + int count = 0; + while ((tok = nptf.next(tok)) != null){ + assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0])))); + assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null); + String type = new String(tok.getPayload().getData(), "UTF-8"); + assertTrue("type is null and it shouldn't be", type != null); + assertTrue(type + " is not equal to " + tok.type(), type.equals(tok.type()) == true); + count++; + } + assertTrue(count + " does not equal: " + 10, count == 10); + } + + private class WordTokenFilter extends TokenFilter { + private WordTokenFilter(TokenStream input) { + super(input); + } + + + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null) { + result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0]))); + } + return result; + } + } + +} \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java new file mode 100644 index 00000000000..64868747c57 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/DateRecognizerSinkTokenizerTest.java @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.sinks; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.TeeTokenFilter; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.Token; + +import java.io.StringReader; +import java.io.IOException; +import java.text.SimpleDateFormat; + +public class DateRecognizerSinkTokenizerTest extends TestCase { + + + public DateRecognizerSinkTokenizerTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() throws IOException { + DateRecognizerSinkTokenizer sink = new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy")); + String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006"; + TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), sink); + Token tok = null; + int count = 0; + while ((tok = tee.next()) != null){ + assertTrue("tok is null and it shouldn't be", tok != null); + if (tok.termBuffer()[0] == '7'){ + assertTrue(tok.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE, + tok.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true); + } + count++; + } + assertTrue(count + " does not equal: " + 18, count == 18); + assertTrue("sink Size: " + sink.getTokens().size() + " is not: " + 2, sink.getTokens().size() == 2); + + } +} \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java new file mode 100644 index 00000000000..71e849301fb --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.sinks; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.TeeTokenFilter; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.Token; + +import java.io.StringReader; +import java.io.IOException; + +public class TokenRangeSinkTokenizerTest extends TestCase { + + + public TokenRangeSinkTokenizerTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() throws IOException { + TokenRangeSinkTokenizer rangeToks = new TokenRangeSinkTokenizer(2, 4); + String test = "The quick red fox jumped over the lazy brown dogs"; + TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), rangeToks); + Token tok = null; + int count = 0; + while ((tok = tee.next()) != null){ + assertTrue("tok is null and it shouldn't be", tok != null); + count++; + } + assertTrue(count + " does not equal: " + 10, count == 10); + assertTrue("rangeToks Size: " + rangeToks.getTokens().size() + " is not: " + 2, rangeToks.getTokens().size() == 2); + } +} \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java new file mode 100644 index 00000000000..db57bb7fba8 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java @@ -0,0 +1,72 @@ +package org.apache.lucene.analysis.sinks; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter; + +import java.io.IOException; +import java.io.StringReader; + +public class TokenTypeSinkTokenizerTest extends TestCase { + + + public TokenTypeSinkTokenizerTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() throws IOException { + TokenTypeSinkTokenizer sink = new TokenTypeSinkTokenizer("D"); + String test = "The quick red fox jumped over the lazy brown dogs"; + + TeeTokenFilter ttf = new TeeTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), sink); + Token tok = new Token(); + boolean seenDogs = false; + while ((tok = ttf.next(tok)) != null) { + if (tok.termText().equals("dogs")) { + seenDogs = true; + assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true); + } else { + assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word")); + } + } + assertTrue(seenDogs + " does not equal: " + true, seenDogs == true); + assertTrue("sink Size: " + sink.getTokens().size() + " is not: " + 1, sink.getTokens().size() == 1); + } + + private class WordTokenFilter extends TokenFilter { + private WordTokenFilter(TokenStream input) { + super(input); + } + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null && result.termText().equals("dogs")) { + result.setType("D"); + } + return result; + } + } +} \ No newline at end of file
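
For reference, a minimal usage sketch (not part of the patch) showing how the new sinks and payloads classes can be combined. It follows the Lucene 2.3-era TokenStream API used in the tests above; the sample text, the 42f payload value, and the SinkPayloadExample class name are illustrative only.

import java.io.IOException;
import java.io.StringReader;
import java.text.SimpleDateFormat;

import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
import org.apache.lucene.analysis.sinks.DateRecognizerSinkTokenizer;

public class SinkPayloadExample {
  public static void main(String[] args) throws IOException {
    String text = "The dogs finally reacted on 7/12/2006";

    // Sink that collects tokens parseable as dates and marks them with type "date".
    DateRecognizerSinkTokenizer dateSink =
        new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy"));

    // Tee the whitespace-tokenized stream into the sink, then attach a numeric
    // payload (42.0f, illustrative) to every main-stream token whose type is DATE_TYPE.
    // The tee runs first, so the date tokens already carry DATE_TYPE when the
    // payload filter sees them.
    TokenStream stream = new NumericPayloadTokenFilter(
        new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(text)), dateSink),
        42f, DateRecognizerSinkTokenizer.DATE_TYPE);

    Token tok = new Token();
    while ((tok = stream.next(tok)) != null) {
      // Date tokens now carry the 4-byte float payload; all other tokens have none.
    }

    // The sink holds clones of the recognized date tokens, e.g. for indexing into a separate field.
    System.out.println("dates recognized: " + dateSink.getTokens().size());
  }
}

The same pattern applies to TokenTypeSinkTokenizer and TokenRangeSinkTokenizer, and TypeAsPayloadTokenFilter can stand in for NumericPayloadTokenFilter when the token type itself should be stored as the payload.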