From 55d0c3a2f8f924e7c274b48b9a8379f2210b2225 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Mon, 17 Dec 2007 13:55:46 +0000 Subject: [PATCH] LUCENE-1077: refactored to have a common PayloadHelper classes. Also added TokenOffsetPayloadTokenFilter, which encodes the Token offset into the payloads git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@604870 13f79535-47bb-0310-9956-ffa450edef68 --- .../payloads/NumericPayloadTokenFilter.java | 38 +--------- .../analysis/payloads/PayloadHelper.java | 70 +++++++++++++++++++ .../TokenOffsetPayloadTokenFilter.java | 52 ++++++++++++++ .../NumericPayloadTokenFilterTest.java | 4 +- .../TokenOffsetPayloadTokenFilterTest.java | 63 +++++++++++++++++ 5 files changed, 189 insertions(+), 38 deletions(-) create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java index 13bbc618254..a9f56b8e0d3 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java @@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads; */ +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.index.Payload; import java.io.IOException; @@ -37,44 +37,10 @@ public class NumericPayloadTokenFilter extends TokenFilter { public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { super(input); //Need to encode the payload - thePayload = new Payload(encodePayload(payload)); + thePayload = new Payload(PayloadHelper.encodeFloat(payload)); this.typeMatch = typeMatch; } - public static byte[] encodePayload(float payload) { - byte[] result = new byte[4]; - int tmp = Float.floatToIntBits(payload); - result[0] = (byte)(tmp >> 24); - result[1] = (byte)(tmp >> 16); - result[2] = (byte)(tmp >> 8); - result[3] = (byte) tmp; - - return result; - } - - /** - * @see #decodePayload(byte[], int) - * @see #encodePayload(float) - */ - public static float decodePayload(byte [] bytes){ - return decodePayload(bytes, 0); - } - - /** - * Decode the payload that was encoded using {@link #encodePayload(float)}. - * NOTE: the length of the array must be at least offset + 4 long. - * @param bytes The bytes to decode - * @param offset The offset into the array. - * @return The float that was encoded - * - * @see #encodePayload(float) - */ - public static final float decodePayload(byte [] bytes, int offset){ - int tmp = ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) - | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); - return Float.intBitsToFloat(tmp); - } - public Token next(Token result) throws IOException { result = input.next(result); if (result != null && result.type().equals(typeMatch)){ diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java new file mode 100644 index 00000000000..b15020267c0 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/PayloadHelper.java @@ -0,0 +1,70 @@ +package org.apache.lucene.analysis.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * + * + **/ +public class PayloadHelper { + + public static byte[] encodeFloat(float payload) { + return encodeFloat(payload, new byte[4], 0); + } + + public static byte[] encodeFloat(float payload, byte[] data, int offset){ + return encodeInt(Float.floatToIntBits(payload), data, offset); + } + + public static byte[] encodeInt(int payload, byte[] data, int offset){ + data[offset] = (byte)(payload >> 24); + data[offset + 1] = (byte)(payload >> 16); + data[offset + 2] = (byte)(payload >> 8); + data[offset + 3] = (byte) payload; + return data; + } + + /** + * @param bytes + * @see #decodeFloat(byte[], int) + * @see #encodeFloat(float) + * @return the decoded float + */ + public static float decodeFloat(byte [] bytes){ + return decodeFloat(bytes, 0); + } + + /** + * Decode the payload that was encoded using {@link #encodeFloat(float)}. + * NOTE: the length of the array must be at least offset + 4 long. + * @param bytes The bytes to decode + * @param offset The offset into the array. + * @return The float that was encoded + * + * @see # encodeFloat (float) + */ + public static final float decodeFloat(byte [] bytes, int offset){ + + return Float.intBitsToFloat(decodeInt(bytes, offset)); + } + + public static final int decodeInt(byte [] bytes, int offset){ + return ((bytes[offset] & 0xFF) << 24) | ((bytes[offset + 1] & 0xFF) << 16) + | ((bytes[offset + 2] & 0xFF) << 8) | (bytes[offset + 3] & 0xFF); + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java new file mode 100644 index 00000000000..03bb447a50f --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java @@ -0,0 +1,52 @@ +package org.apache.lucene.analysis.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.Payload; + +import java.io.IOException; + + +/** + * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)} + * and {@link org.apache.lucene.analysis.Token#setEndOffset(int)} + * First 4 bytes are the start + * + **/ +public class TokenOffsetPayloadTokenFilter extends TokenFilter { + + + public TokenOffsetPayloadTokenFilter(TokenStream input) { + super(input); + } + + public Token next(Token result) throws IOException { + result = input.next(result); + if (result != null){ + byte[] data = new byte[8]; + PayloadHelper.encodeInt(result.startOffset(), data, 0); + PayloadHelper.encodeInt(result.endOffset(), data, 4); + Payload payload = new Payload(data); + result.setPayload(payload); + } + return result; + } +} \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java index edfbb72abd0..1ed0dcc273e 100644 --- a/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterTest.java @@ -17,9 +17,9 @@ package org.apache.lucene.analysis.payloads; */ import junit.framework.TestCase; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.WhitespaceTokenizer; import java.io.IOException; @@ -53,7 +53,7 @@ public class NumericPayloadTokenFilterTest extends TestCase { byte [] bytes = tok.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length assertTrue(bytes.length + " does not equal: " + tok.getPayload().length(), bytes.length == tok.getPayload().length()); assertTrue(tok.getPayload().getOffset() + " does not equal: " + 0, tok.getPayload().getOffset() == 0); - float pay = NumericPayloadTokenFilter.decodePayload(bytes); + float pay = PayloadHelper.decodeFloat(bytes); assertTrue(pay + " does not equal: " + 3, pay == 3); } else { assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word")); diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java new file mode 100644 index 00000000000..96e3a820aa3 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterTest.java @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.payloads; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.index.Payload; + +import java.io.IOException; +import java.io.StringReader; + +public class TokenOffsetPayloadTokenFilterTest extends TestCase { + + + public TokenOffsetPayloadTokenFilterTest(String s) { + super(s); + } + + protected void setUp() { + } + + protected void tearDown() { + + } + + public void test() throws IOException { + String test = "The quick red fox jumped over the lazy brown dogs"; + + TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test))); + Token tok = new Token(); + int count = 0; + while ((tok = nptf.next(tok)) != null){ + assertTrue("tok is null and it shouldn't be", tok != null); + Payload pay = tok.getPayload(); + assertTrue("pay is null and it shouldn't be", pay != null); + byte [] data = pay.getData(); + int start = PayloadHelper.decodeInt(data, 0); + assertTrue(start + " does not equal: " + tok.startOffset(), start == tok.startOffset()); + int end = PayloadHelper.decodeInt(data, 4); + assertTrue(end + " does not equal: " + tok.endOffset(), end == tok.endOffset()); + count++; + } + assertTrue(count + " does not equal: " + 10, count == 10); + + } + + +} \ No newline at end of file