mirror of https://github.com/apache/lucene.git
LUCENE-1077 new sinks and payloads analysis packages
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@602081 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ce9b80539d
commit
f9b2e971f2
|
@ -0,0 +1,85 @@
|
||||||
|
package org.apache.lucene.analysis.payloads;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.index.Payload;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assigns a payload to a token based on the {@link org.apache.lucene.analysis.Token#type()}
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class NumericPayloadTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
|
private String typeMatch;
|
||||||
|
private Payload thePayload;
|
||||||
|
|
||||||
|
public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
|
||||||
|
super(input);
|
||||||
|
//Need to encode the payload
|
||||||
|
thePayload = new Payload(encodePayload(payload));
|
||||||
|
this.typeMatch = typeMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte[] encodePayload(float payload) {
|
||||||
|
byte[] result = new byte[4];
|
||||||
|
int tmp = Float.floatToIntBits(payload);
|
||||||
|
result[0] = (byte)(tmp >> 24);
|
||||||
|
result[1] = (byte)(tmp >> 16);
|
||||||
|
result[2] = (byte)(tmp >> 8);
|
||||||
|
result[3] = (byte) tmp;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see #decodePayload(byte[], int)
|
||||||
|
* @see #encodePayload(float)
|
||||||
|
*/
|
||||||
|
public static float decodePayload(byte [] bytes){
|
||||||
|
return decodePayload(bytes, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode the payload that was encoded using {@link #encodePayload(float)}.
|
||||||
|
* NOTE: the length of the array must be at least offset + 4 long.
|
||||||
|
* @param bytes The bytes to decode
|
||||||
|
* @param offset The offset into the array.
|
||||||
|
* @return The float that was encoded
|
||||||
|
*
|
||||||
|
* @see #encodePayload(float)
|
||||||
|
*/
|
||||||
|
public static final float decodePayload(byte [] bytes, int offset){
|
||||||
|
int tmp = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16)
|
||||||
|
| ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF);
|
||||||
|
return Float.intBitsToFloat(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
result = input.next(result);
|
||||||
|
if (result != null && result.type().equals(typeMatch)){
|
||||||
|
result.setPayload(thePayload);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.lucene.analysis.payloads;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.index.Payload;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Makes the {@link org.apache.lucene.analysis.Token#type()} a payload.
|
||||||
|
*
|
||||||
|
* Encodes the type using {@link String#getBytes(String)} with "UTF-8" as the encoding
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class TypeAsPayloadTokenFilter extends TokenFilter {
|
||||||
|
|
||||||
|
public TypeAsPayloadTokenFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
result = input.next(result);
|
||||||
|
if (result != null && result.type() != null && result.type().equals("") == false){
|
||||||
|
result.setPayload(new Payload(result.type().getBytes("UTF-8")));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
<HTML>
|
||||||
|
<!--
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
--><HEAD>
|
||||||
|
<TITLE>org.apache.lucene.analysis.payloads</TITLE>
|
||||||
|
</HEAD>
|
||||||
|
<BODY>
|
||||||
|
<DIV>Provides various convenience classes for creating payloads on Tokens.
|
||||||
|
</DIV>
|
||||||
|
<DIV> </DIV>
|
||||||
|
<DIV align="center">
|
||||||
|
Copyright © 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
|
||||||
|
</DIV>
|
||||||
|
</BODY>
|
||||||
|
</HTML>
|
|
@ -0,0 +1,87 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.SinkTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Date;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempts to parse the {@link org.apache.lucene.analysis.Token#termBuffer()} as a Date using a {@link java.text.DateFormat}.
|
||||||
|
* If the value is a Date, it will add it to the sink.
|
||||||
|
* <p/>
|
||||||
|
* Also marks the sink token with {@link org.apache.lucene.analysis.Token#type()} equal to {@link #DATE_TYPE}
|
||||||
|
*
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class DateRecognizerSinkTokenizer extends SinkTokenizer {
|
||||||
|
public static final String DATE_TYPE = "date";
|
||||||
|
|
||||||
|
protected DateFormat dateFormat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
|
||||||
|
*/
|
||||||
|
public DateRecognizerSinkTokenizer() {
|
||||||
|
this(null, SimpleDateFormat.getDateInstance());
|
||||||
|
}
|
||||||
|
|
||||||
|
public DateRecognizerSinkTokenizer(DateFormat dateFormat) {
|
||||||
|
this(null, dateFormat);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
|
||||||
|
* @param input The input list of Tokens that are already Dates. They should be marked as type {@link #DATE_TYPE} for completeness
|
||||||
|
*/
|
||||||
|
public DateRecognizerSinkTokenizer(List/*<Token>*/ input) {
|
||||||
|
this(input, SimpleDateFormat.getDateInstance());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param input
|
||||||
|
* @param dateFormat The date format to use to try and parse the date. Note, this SinkTokenizer makes no attempt to synchronize the DateFormat object
|
||||||
|
*/
|
||||||
|
public DateRecognizerSinkTokenizer(List/*<Token>*/ input, DateFormat dateFormat) {
|
||||||
|
super(input);
|
||||||
|
this.dateFormat = dateFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(Token t) {
|
||||||
|
//Check to see if this token is a date
|
||||||
|
if (t != null) {
|
||||||
|
try {
|
||||||
|
Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
|
||||||
|
if (date != null) {
|
||||||
|
t.setType(DATE_TYPE);
|
||||||
|
lst.add(t.clone());
|
||||||
|
}
|
||||||
|
} catch (ParseException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,55 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.SinkTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Counts the tokens as they go by and saves to the internal list those between the range of lower and upper, exclusive of upper
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class TokenRangeSinkTokenizer extends SinkTokenizer {
|
||||||
|
private int lower;
|
||||||
|
private int upper;
|
||||||
|
private int count;
|
||||||
|
|
||||||
|
public TokenRangeSinkTokenizer(int lower, int upper) {
|
||||||
|
this.lower = lower;
|
||||||
|
this.upper = upper;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenRangeSinkTokenizer(int initCap, int lower, int upper) {
|
||||||
|
super(initCap);
|
||||||
|
this.lower = lower;
|
||||||
|
this.upper = upper;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(Token t) {
|
||||||
|
if (count >= lower && count < upper){
|
||||||
|
super.add(t);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset() throws IOException {
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.SinkTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the {@link org.apache.lucene.analysis.Token#type()} matches the passed in <code>typeToMatch</code> then
|
||||||
|
* add it to the sink
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class TokenTypeSinkTokenizer extends SinkTokenizer {
|
||||||
|
|
||||||
|
private String typeToMatch;
|
||||||
|
|
||||||
|
public TokenTypeSinkTokenizer(String typeToMatch) {
|
||||||
|
this.typeToMatch = typeToMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenTypeSinkTokenizer(int initCap, String typeToMatch) {
|
||||||
|
super(initCap);
|
||||||
|
this.typeToMatch = typeToMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenTypeSinkTokenizer(List/*<Token>*/ input, String typeToMatch) {
|
||||||
|
super(input);
|
||||||
|
this.typeToMatch = typeToMatch;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(Token t) {
|
||||||
|
//check to see if this is a Category
|
||||||
|
if (t != null && typeToMatch.equals(t.type())){
|
||||||
|
lst.add(t.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
<HTML>
|
||||||
|
<!--
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
--><HEAD>
|
||||||
|
<TITLE>org.apache.lucene.analysis.sinks</TITLE>
|
||||||
|
</HEAD>
|
||||||
|
<BODY>
|
||||||
|
<DIV>Implementations of the SinkTokenizer that might be useful.
|
||||||
|
</DIV>
|
||||||
|
<DIV> </DIV>
|
||||||
|
<DIV align="center">
|
||||||
|
Copyright © 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
|
||||||
|
</DIV>
|
||||||
|
</BODY>
|
||||||
|
</HTML>
|
|
@ -0,0 +1,79 @@
|
||||||
|
package org.apache.lucene.analysis.payloads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
public class NumericPayloadTokenFilterTest extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
|
public NumericPayloadTokenFilterTest(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() {
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||||
|
|
||||||
|
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
|
||||||
|
Token tok = new Token();
|
||||||
|
boolean seenDogs = false;
|
||||||
|
while ((tok = nptf.next(tok)) != null){
|
||||||
|
if (tok.termText().equals("dogs")){
|
||||||
|
seenDogs = true;
|
||||||
|
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
|
||||||
|
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
|
||||||
|
byte [] bytes = tok.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
|
||||||
|
assertTrue(bytes.length + " does not equal: " + tok.getPayload().length(), bytes.length == tok.getPayload().length());
|
||||||
|
assertTrue(tok.getPayload().getOffset() + " does not equal: " + 0, tok.getPayload().getOffset() == 0);
|
||||||
|
float pay = NumericPayloadTokenFilter.decodePayload(bytes);
|
||||||
|
assertTrue(pay + " does not equal: " + 3, pay == 3);
|
||||||
|
} else {
|
||||||
|
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class WordTokenFilter extends TokenFilter {
|
||||||
|
private WordTokenFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
result = input.next(result);
|
||||||
|
if (result != null && result.termText().equals("dogs")) {
|
||||||
|
result.setType("D");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,76 @@
|
||||||
|
package org.apache.lucene.analysis.payloads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.index.Payload;

import java.io.IOException;
import java.io.StringReader;

||||||
|
|
||||||
|
public class TypeAsPayloadTokenFilterTest extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
|
public TypeAsPayloadTokenFilterTest(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() {
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||||
|
|
||||||
|
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
|
||||||
|
Token tok = new Token();
|
||||||
|
int count = 0;
|
||||||
|
while ((tok = nptf.next(tok)) != null){
|
||||||
|
assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0]))));
|
||||||
|
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
|
||||||
|
String type = new String(tok.getPayload().getData(), "UTF-8");
|
||||||
|
assertTrue("type is null and it shouldn't be", type != null);
|
||||||
|
assertTrue(type + " is not equal to " + tok.type(), type.equals(tok.type()) == true);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class WordTokenFilter extends TokenFilter {
|
||||||
|
private WordTokenFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
result = input.next(result);
|
||||||
|
if (result != null) {
|
||||||
|
result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0])));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.TeeTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
|
||||||
|
public class DateRecognizerSinkTokenizerTest extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
|
public DateRecognizerSinkTokenizerTest(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() {
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
DateRecognizerSinkTokenizer sink = new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy"));
|
||||||
|
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
|
||||||
|
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), sink);
|
||||||
|
Token tok = null;
|
||||||
|
int count = 0;
|
||||||
|
while ((tok = tee.next()) != null){
|
||||||
|
assertTrue("tok is null and it shouldn't be", tok != null);
|
||||||
|
if (tok.termBuffer()[0] == '7'){
|
||||||
|
assertTrue(tok.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
|
||||||
|
tok.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertTrue(count + " does not equal: " + 18, count == 18);
|
||||||
|
assertTrue("sink Size: " + sink.getTokens().size() + " is not: " + 2, sink.getTokens().size() == 2);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.TeeTokenFilter;
|
||||||
|
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class TokenRangeSinkTokenizerTest extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
|
public TokenRangeSinkTokenizerTest(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() {
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
TokenRangeSinkTokenizer rangeToks = new TokenRangeSinkTokenizer(2, 4);
|
||||||
|
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||||
|
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), rangeToks);
|
||||||
|
Token tok = null;
|
||||||
|
int count = 0;
|
||||||
|
while ((tok = tee.next()) != null){
|
||||||
|
assertTrue("tok is null and it shouldn't be", tok != null);
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||||
|
assertTrue("rangeToks Size: " + rangeToks.getTokens().size() + " is not: " + 2, rangeToks.getTokens().size() == 2);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
package org.apache.lucene.analysis.sinks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
public class TokenTypeSinkTokenizerTest extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
|
public TokenTypeSinkTokenizerTest(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() {
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
TokenTypeSinkTokenizer sink = new TokenTypeSinkTokenizer("D");
|
||||||
|
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||||
|
|
||||||
|
TeeTokenFilter ttf = new TeeTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), sink);
|
||||||
|
Token tok = new Token();
|
||||||
|
boolean seenDogs = false;
|
||||||
|
while ((tok = ttf.next(tok)) != null) {
|
||||||
|
if (tok.termText().equals("dogs")) {
|
||||||
|
seenDogs = true;
|
||||||
|
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
|
||||||
|
} else {
|
||||||
|
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
|
||||||
|
assertTrue("sink Size: " + sink.getTokens().size() + " is not: " + 1, sink.getTokens().size() == 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class WordTokenFilter extends TokenFilter {
|
||||||
|
private WordTokenFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
result = input.next(result);
|
||||||
|
if (result != null && result.termText().equals("dogs")) {
|
||||||
|
result.setType("D");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue