LUCENE-1077 new sinks and payloads analysis packages

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@602081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-12-07 12:21:49 +00:00
parent ce9b80539d
commit f9b2e971f2
12 changed files with 732 additions and 0 deletions

View File

@ -0,0 +1,85 @@
package org.apache.lucene.analysis.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import java.io.IOException;
/**
 * Attaches a fixed numeric payload to every token whose
 * {@link org.apache.lucene.analysis.Token#type()} equals a configured type.
 * <p/>
 * The float value is encoded once, when the filter is constructed, as the four bytes of
 * {@link Float#floatToIntBits(float)} with the most significant byte first. The resulting
 * {@link Payload} instance is shared by all matching tokens.
 *
 **/
public class NumericPayloadTokenFilter extends TokenFilter {

  private String typeMatch;
  private Payload thePayload;

  /**
   * @param input     The stream whose tokens are inspected
   * @param payload   The value to attach to matching tokens
   * @param typeMatch The token type that selects which tokens receive the payload
   */
  public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
    super(input);
    this.typeMatch = typeMatch;
    //The payload never changes, so it is encoded exactly once here
    byte[] encoded = encodePayload(payload);
    this.thePayload = new Payload(encoded);
  }

  /**
   * Encode a float as four bytes, most significant byte first.
   *
   * @param payload The value to encode
   * @return A new 4 byte array holding the bits of the float
   *
   * @see #decodePayload(byte[], int)
   */
  public static byte[] encodePayload(float payload) {
    int bits = Float.floatToIntBits(payload);
    return new byte[] {
        (byte) (bits >> 24),
        (byte) (bits >> 16),
        (byte) (bits >> 8),
        (byte) bits
    };
  }

  /**
   * Decode a float from the first four bytes of the array.
   *
   * @param bytes The bytes to decode
   * @return The float that was encoded
   *
   * @see #decodePayload(byte[], int)
   * @see #encodePayload(float)
   */
  public static float decodePayload(byte [] bytes){
    return decodePayload(bytes, 0);
  }

  /**
   * Decode the payload that was encoded using {@link #encodePayload(float)}.
   * NOTE: the length of the array must be at least offset + 4 long.
   *
   * @param bytes  The bytes to decode
   * @param offset The index of the most significant byte within the array
   * @return The float that was encoded
   *
   * @see #encodePayload(float)
   */
  public static final float decodePayload(byte [] bytes, int offset){
    int bits = 0;
    //Fold the four bytes together, most significant byte first
    for (int i = 0; i < 4; i++) {
      bits = (bits << 8) | (bytes[offset + i] & 0xFF);
    }
    return Float.intBitsToFloat(bits);
  }

  /**
   * Pulls the next token from the wrapped stream and, when its type equals the configured
   * type, sets the pre-encoded payload on it.
   *
   * @param result The token instance handed on to the wrapped stream
   * @return The next token, or null when the wrapped stream returns null
   */
  public Token next(Token result) throws IOException {
    Token current = input.next(result);
    if (current == null) {
      return null;
    }
    if (current.type().equals(typeMatch)) {
      current.setPayload(thePayload);
    }
    return current;
  }
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.analysis.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import java.io.IOException;
/**
 * Stores the {@link org.apache.lucene.analysis.Token#type()} of each token as that token's payload.
 * <p/>
 * The type is converted to bytes using {@link String#getBytes(String)} with "UTF-8" as the encoding.
 * Tokens whose type is null or the empty string are passed through without a payload being set.
 *
 **/
public class TypeAsPayloadTokenFilter extends TokenFilter {

  /**
   * @param input The stream whose tokens are given a payload
   */
  public TypeAsPayloadTokenFilter(TokenStream input) {
    super(input);
  }

  /**
   * Pulls the next token from the wrapped stream and, if it has a non-empty type, sets a new
   * payload on it holding the UTF-8 bytes of that type.
   *
   * @param result The token instance handed on to the wrapped stream
   * @return The next token, or null when the wrapped stream returns null
   */
  public Token next(Token result) throws IOException {
    Token current = input.next(result);
    if (current != null) {
      String type = current.type();
      boolean hasType = type != null && type.length() > 0;
      if (hasType) {
        current.setPayload(new Payload(type.getBytes("UTF-8")));
      }
    }
    return current;
  }
}

View File

@ -0,0 +1,31 @@
<HTML>
<!--
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
--><HEAD>
<TITLE>org.apache.lucene.analysis.payloads</TITLE>
</HEAD>
<BODY>
<DIV>Provides various convenience classes for creating payloads on Tokens.
</DIV>
<DIV>&nbsp;</DIV>
<DIV align="center">
Copyright &copy; 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
</DIV>
</BODY>
</HTML>

View File

@ -0,0 +1,87 @@
package org.apache.lucene.analysis.sinks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.Token;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.text.ParseException;
import java.util.List;
import java.util.Date;
/**
 * Attempts to parse the {@link org.apache.lucene.analysis.Token#termBuffer()} as a Date using a
 * {@link java.text.DateFormat}. If the text parses, a clone of the token is added to the sink.
 * <p/>
 * A recognized token has its {@link org.apache.lucene.analysis.Token#type()} set to {@link #DATE_TYPE}.
 * The type is set on the token passed to {@link #add(Token)} before it is cloned, so the caller's
 * token is marked as well.
 *
 **/
public class DateRecognizerSinkTokenizer extends SinkTokenizer {

  /** The type assigned to tokens that were recognized as dates. */
  public static final String DATE_TYPE = "date";

  /** The format used to decide whether a token is a date. */
  protected DateFormat dateFormat;

  /**
   * Uses {@link java.text.DateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
   */
  public DateRecognizerSinkTokenizer() {
    this(null, DateFormat.getDateInstance());
  }

  /**
   * @param dateFormat The date format to use to try and parse the date
   */
  public DateRecognizerSinkTokenizer(DateFormat dateFormat) {
    this(null, dateFormat);
  }

  /**
   * Uses {@link java.text.DateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
   *
   * @param input The input list of Tokens that are already Dates. They should be marked as type
   *              {@link #DATE_TYPE} for completeness
   */
  public DateRecognizerSinkTokenizer(List/*<Token>*/ input) {
    this(input, DateFormat.getDateInstance());
  }

  /**
   * @param input      The list of Tokens passed on to the superclass constructor
   * @param dateFormat The date format to use to try and parse the date. Note, this SinkTokenizer
   *                   makes no attempt to synchronize the DateFormat object
   */
  public DateRecognizerSinkTokenizer(List/*<Token>*/ input, DateFormat dateFormat) {
    super(input);
    this.dateFormat = dateFormat;
  }

  /**
   * Adds a clone of the token to the sink if its term text can be parsed by the date format.
   * Null tokens and tokens that do not parse are ignored.
   *
   * @param t The candidate token; its type is set to {@link #DATE_TYPE} when it is recognized
   */
  public void add(Token t) {
    if (t == null) {
      return;
    }
    String text = new String(t.termBuffer(), 0, t.termLength());
    try {
      //The parsed value itself is not needed, only the fact that parsing succeeded
      Date parsed = dateFormat.parse(text);
      if (parsed != null) {
        t.setType(DATE_TYPE);
        lst.add(t.clone());
      }
    } catch (ParseException e) {
      //Deliberately ignored: the token is simply not a date
    }
  }
}

View File

@ -0,0 +1,55 @@
package org.apache.lucene.analysis.sinks;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.Token;
import java.io.IOException;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Counts the tokens as they go by and saves to the internal list those whose position falls between
 * lower and upper, inclusive of lower and exclusive of upper.
 * <p/>
 * Positions are zero-based: the first token passed to {@link #add(Token)} has position 0. Every call
 * to {@link #add(Token)} advances the position, whether or not the token was kept.
 *
 **/
public class TokenRangeSinkTokenizer extends SinkTokenizer {

  private int lower;
  private int upper;
  private int count;

  /**
   * @param lower The position of the first token to keep (inclusive)
   * @param upper The position at which to stop keeping tokens (exclusive)
   */
  public TokenRangeSinkTokenizer(int lower, int upper) {
    this.lower = lower;
    this.upper = upper;
  }

  /**
   * @param initCap The initial capacity passed on to the superclass constructor
   * @param lower   The position of the first token to keep (inclusive)
   * @param upper   The position at which to stop keeping tokens (exclusive)
   */
  public TokenRangeSinkTokenizer(int initCap, int lower, int upper) {
    super(initCap);
    this.lower = lower;
    this.upper = upper;
  }

  /**
   * Hands the token to the superclass if the current position is within [lower, upper), then
   * advances the position.
   *
   * @param t The token seen at the current position
   */
  public void add(Token t) {
    if (count >= lower && count < upper){
      super.add(t);
    }
    count++;
  }

  /**
   * Resets the superclass state and restarts position counting at 0.
   */
  public void reset() throws IOException {
    //Let the superclass perform its own reset as well; overriding without
    //delegating would replace that behavior entirely
    super.reset();
    count = 0;
  }
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.analysis.sinks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.Token;
import java.util.List;
/**
 * Keeps only those tokens whose {@link org.apache.lucene.analysis.Token#type()} equals the
 * <code>typeToMatch</code> given at construction time. A clone of each matching token is
 * added to the sink.
 *
 **/
public class TokenTypeSinkTokenizer extends SinkTokenizer {

  private String typeToMatch;

  /**
   * @param typeToMatch The type a token must have to be added to the sink
   */
  public TokenTypeSinkTokenizer(String typeToMatch) {
    this.typeToMatch = typeToMatch;
  }

  /**
   * @param initCap     The initial capacity passed on to the superclass constructor
   * @param typeToMatch The type a token must have to be added to the sink
   */
  public TokenTypeSinkTokenizer(int initCap, String typeToMatch) {
    super(initCap);
    this.typeToMatch = typeToMatch;
  }

  /**
   * @param input       The list of Tokens passed on to the superclass constructor
   * @param typeToMatch The type a token must have to be added to the sink
   */
  public TokenTypeSinkTokenizer(List/*<Token>*/ input, String typeToMatch) {
    super(input);
    this.typeToMatch = typeToMatch;
  }

  /**
   * Adds a clone of the token to the sink when its type equals <code>typeToMatch</code>.
   * Null tokens are ignored.
   *
   * @param t The candidate token
   */
  public void add(Token t) {
    if (t == null) {
      return;
    }
    //Only tokens carrying the requested type are kept
    if (typeToMatch.equals(t.type())) {
      lst.add(t.clone());
    }
  }
}

View File

@ -0,0 +1,30 @@
<HTML>
<!--
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
--><HEAD>
<TITLE>org.apache.lucene.analysis.sinks</TITLE>
</HEAD>
<BODY>
<DIV>Implementations of the SinkTokenizer that might be useful.
</DIV>
<DIV>&nbsp;</DIV>
<DIV align="center">
Copyright &copy; 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
</DIV>
</BODY>
</HTML>

View File

@ -0,0 +1,79 @@
package org.apache.lucene.analysis.payloads;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
/**
 * Checks that {@link NumericPayloadTokenFilter} sets the encoded payload on the token whose type
 * matches, and that the other tokens keep the type "word".
 */
public class NumericPayloadTokenFilterTest extends TestCase {

  public NumericPayloadTokenFilterTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(
        new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
    Token tok = new Token();
    boolean seenDogs = false;
    while ((tok = nptf.next(tok)) != null){
      if (tok.termText().equals("dogs")){
        seenDogs = true;
        assertEquals("unexpected type for 'dogs'", "D", tok.type());
        assertNotNull("tok.getPayload() is null and it shouldn't be", tok.getPayload());
        //safe here to just use the bytes, otherwise we should use offset, length
        byte [] bytes = tok.getPayload().getData();
        assertEquals("payload data length differs from payload length", tok.getPayload().length(), bytes.length);
        assertEquals("unexpected payload offset", 0, tok.getPayload().getOffset());
        float pay = NumericPayloadTokenFilter.decodePayload(bytes);
        assertEquals("unexpected decoded payload", 3f, pay, 0f);
      } else {
        //every token other than "dogs" is expected to report the type "word"
        assertEquals("unexpected type for '" + tok.termText() + "'", "word", tok.type());
      }
    }
    assertTrue("never saw the 'dogs' token", seenDogs);
  }

  /**
   * Marks the token "dogs" with type "D"; all other tokens are passed through untouched.
   * Declared static because it needs nothing from the enclosing test instance.
   */
  private static class WordTokenFilter extends TokenFilter {
    private WordTokenFilter(TokenStream input) {
      super(input);
    }

    public Token next(Token result) throws IOException {
      result = input.next(result);
      if (result != null && result.termText().equals("dogs")) {
        result.setType("D");
      }
      return result;
    }
  }
}

View File

@ -0,0 +1,76 @@
package org.apache.lucene.analysis.payloads;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.IOException;
import java.io.StringReader;
/**
 * Checks that {@link TypeAsPayloadTokenFilter} stores each token's type as a UTF-8 encoded payload.
 */
public class TypeAsPayloadTokenFilterTest extends TestCase {

  public TypeAsPayloadTokenFilterTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(
        new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
    Token tok = new Token();
    int count = 0;
    while ((tok = nptf.next(tok)) != null){
      //WordTokenFilter sets the type to the upper-cased first character of the term
      String expectedType = String.valueOf(Character.toUpperCase(tok.termBuffer()[0]));
      assertEquals("unexpected token type", expectedType, tok.type());
      assertNotNull("tok.getPayload() is null and it shouldn't be", tok.getPayload());
      String type = new String(tok.getPayload().getData(), "UTF-8");
      assertEquals("payload does not decode to the token type", tok.type(), type);
      count++;
    }
    assertEquals("unexpected number of tokens", 10, count);
  }

  /**
   * Sets the type of every token to the upper-cased first character of its term.
   * Declared static because it needs nothing from the enclosing test instance.
   */
  private static class WordTokenFilter extends TokenFilter {
    private WordTokenFilter(TokenStream input) {
      super(input);
    }

    public Token next(Token result) throws IOException {
      result = input.next(result);
      if (result != null) {
        result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0])));
      }
      return result;
    }
  }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.sinks;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.Token;
import java.io.StringReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
/**
 * Runs a sentence containing two MM/dd/yyyy dates through a tee and checks that the
 * {@link DateRecognizerSinkTokenizer} sink captures exactly those two tokens.
 */
public class DateRecognizerSinkTokenizerTest extends TestCase {

  public DateRecognizerSinkTokenizerTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";

    DateRecognizerSinkTokenizer sink = new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy"));
    WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader(test));
    TeeTokenFilter tee = new TeeTokenFilter(source, sink);

    int count = 0;
    for (Token tok = tee.next(); tok != null; tok = tee.next()) {
      assertTrue("tok is null and it shouldn't be", tok != null);
      //the only tokens starting with '7' in the input are the two dates
      if (tok.termBuffer()[0] == '7') {
        String type = tok.type();
        assertTrue(type + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
            type.equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
      }
      count++;
    }

    assertTrue(count + " does not equal: " + 18, count == 18);
    int sinkSize = sink.getTokens().size();
    assertTrue("sink Size: " + sinkSize + " is not: " + 2, sinkSize == 2);
  }
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.analysis.sinks;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.Token;
import java.io.StringReader;
import java.io.IOException;
/**
 * Runs a ten word sentence through a tee and checks that a {@link TokenRangeSinkTokenizer}
 * configured with lower 2 and upper 4 captures exactly two tokens.
 */
public class TokenRangeSinkTokenizerTest extends TestCase {

  public TokenRangeSinkTokenizerTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TokenRangeSinkTokenizer rangeToks = new TokenRangeSinkTokenizer(2, 4);
    WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader(test));
    TeeTokenFilter tee = new TeeTokenFilter(source, rangeToks);

    int count = 0;
    for (Token tok = tee.next(); tok != null; tok = tee.next()) {
      assertTrue("tok is null and it shouldn't be", tok != null);
      count++;
    }

    assertTrue(count + " does not equal: " + 10, count == 10);
    int captured = rangeToks.getTokens().size();
    assertTrue("rangeToks Size: " + captured + " is not: " + 2, captured == 2);
  }
}

View File

@ -0,0 +1,72 @@
package org.apache.lucene.analysis.sinks;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
import java.io.IOException;
import java.io.StringReader;
/**
 * Checks that a {@link TokenTypeSinkTokenizer} matching type "D" captures only the single token
 * that the upstream filter marks with that type.
 */
public class TokenTypeSinkTokenizerTest extends TestCase {

  public TokenTypeSinkTokenizerTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    TokenTypeSinkTokenizer sink = new TokenTypeSinkTokenizer("D");
    String test = "The quick red fox jumped over the lazy brown dogs";

    TeeTokenFilter ttf = new TeeTokenFilter(
        new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), sink);
    Token tok = new Token();
    boolean seenDogs = false;
    while ((tok = ttf.next(tok)) != null) {
      if (tok.termText().equals("dogs")) {
        seenDogs = true;
        assertEquals("unexpected type for 'dogs'", "D", tok.type());
      } else {
        //every token other than "dogs" is expected to report the type "word"
        assertEquals("unexpected type for '" + tok.termText() + "'", "word", tok.type());
      }
    }
    assertTrue("never saw the 'dogs' token", seenDogs);
    assertEquals("unexpected number of tokens in the sink", 1, sink.getTokens().size());
  }

  /**
   * Marks the token "dogs" with type "D"; all other tokens are passed through untouched.
   * Declared static because it needs nothing from the enclosing test instance.
   */
  private static class WordTokenFilter extends TokenFilter {
    private WordTokenFilter(TokenStream input) {
      super(input);
    }

    public Token next(Token result) throws IOException {
      result = input.next(result);
      if (result != null && result.termText().equals("dogs")) {
        result.setType("D");
      }
      return result;
    }
  }
}