HBASE-11907 Use the joni byte[] regex engine in place of j.u.regex

This commit is contained in:
Andrew Purtell 2014-10-02 23:06:33 -07:00
parent 1dd7030701
commit 5881eed36e
6 changed files with 656 additions and 44 deletions

View File

@ -134,6 +134,10 @@
<groupId>org.codehaus.jackson</groupId> <groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId> <artifactId>jackson-mapper-asl</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.jruby.joni</groupId>
<artifactId>joni</artifactId>
</dependency>
<dependency> <dependency>
<groupId>log4j</groupId> <groupId>log4j</groupId>
<artifactId>log4j</artifactId> <artifactId>log4j</artifactId>

View File

@ -19,20 +19,28 @@
package org.apache.hadoop.hbase.filter; package org.apache.hadoop.hbase.filter;
import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
import org.apache.hadoop.hbase.util.Bytes;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.IllegalCharsetNameException;
import java.util.Arrays; import java.util.Arrays;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.specific.UTF8Encoding;
import org.joni.Matcher;
import org.joni.Option;
import org.joni.Regex;
import org.joni.Syntax;
/** /**
* This comparator is for use with {@link CompareFilter} implementations, such * This comparator is for use with {@link CompareFilter} implementations, such
* as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for * as {@link RowFilter}, {@link QualifierFilter}, and {@link ValueFilter}, for
@ -69,9 +77,13 @@ public class RegexStringComparator extends ByteArrayComparable {
private static final Log LOG = LogFactory.getLog(RegexStringComparator.class); private static final Log LOG = LogFactory.getLog(RegexStringComparator.class);
private Charset charset = HConstants.UTF8_CHARSET; private Engine engine;
private Pattern pattern; /** Engine implementation type (default=JAVA) */
public enum EngineType {
// Backed by JavaRegexEngine (java.util.regex.Pattern); used by the constructors that take no engine argument
JAVA,
// Backed by JoniRegexEngine (joni Regex)
JONI
}
/** /**
* Constructor * Constructor
@ -82,14 +94,41 @@ public class RegexStringComparator extends ByteArrayComparable {
this(expr, Pattern.DOTALL); this(expr, Pattern.DOTALL);
} }
/**
* Creates a comparator for the given expression using the requested regex
* engine. Delegates to the three-argument constructor with
* {@link Pattern#DOTALL} as the only flag.
* @param expr a valid regular expression
* @param engine engine implementation type
*/
public RegexStringComparator(String expr, EngineType engine) {
this(expr, Pattern.DOTALL, engine);
}
/** /**
* Constructor * Constructor
* @param expr a valid regular expression * @param expr a valid regular expression
* @param flags java.util.regex.Pattern flags * @param flags java.util.regex.Pattern flags
*/ */
public RegexStringComparator(String expr, int flags) { public RegexStringComparator(String expr, int flags) {
this(expr, flags, EngineType.JAVA);
}
/**
* Constructor
* @param expr a valid regular expression
* @param flags java.util.regex.Pattern flags
* @param engine engine implementation type
*/
public RegexStringComparator(String expr, int flags, EngineType engine) {
super(Bytes.toBytes(expr)); super(Bytes.toBytes(expr));
this.pattern = Pattern.compile(expr, flags); switch (engine) {
case JAVA:
this.engine = new JavaRegexEngine(expr, flags);
break;
case JONI:
this.engine = new JoniRegexEngine(expr, flags);
break;
}
} }
/** /**
@ -104,34 +143,19 @@ public class RegexStringComparator extends ByteArrayComparable {
* @param charset The charset to use. * @param charset The charset to use.
*/ */
public void setCharset(final Charset charset) { public void setCharset(final Charset charset) {
this.charset = charset; engine.setCharset(charset.name());
} }
@Override @Override
public int compareTo(byte[] value, int offset, int length) { public int compareTo(byte[] value, int offset, int length) {
// Use find() for subsequence match instead of matches() (full sequence return engine.compareTo(value, offset, length);
// match) to adhere to the principle of least surprise.
String tmp;
if (length < value.length / 2) {
// See HBASE-9428. Make a copy of the relevant part of the byte[],
// or the JDK will copy the entire byte[] during String decode
tmp = new String(Arrays.copyOfRange(value, offset, offset + length), charset);
} else {
tmp = new String(value, offset, length, charset);
}
return pattern.matcher(tmp).find() ? 0 : 1;
} }
/** /**
* @return The comparator serialized using pb * @return The comparator serialized using pb
*/ */
public byte [] toByteArray() { public byte [] toByteArray() {
ComparatorProtos.RegexStringComparator.Builder builder = return engine.toByteArray();
ComparatorProtos.RegexStringComparator.newBuilder();
builder.setPattern(pattern.toString());
builder.setPatternFlags(pattern.flags());
builder.setCharset(charset.name());
return builder.build().toByteArray();
} }
/** /**
@ -148,13 +172,18 @@ public class RegexStringComparator extends ByteArrayComparable {
} catch (InvalidProtocolBufferException e) { } catch (InvalidProtocolBufferException e) {
throw new DeserializationException(e); throw new DeserializationException(e);
} }
RegexStringComparator comparator;
RegexStringComparator comparator = if (proto.hasEngine()) {
new RegexStringComparator(proto.getPattern(), proto.getPatternFlags()); EngineType engine = EngineType.valueOf(proto.getEngine());
final String charset = proto.getCharset(); comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags(),
engine);
} else {
comparator = new RegexStringComparator(proto.getPattern(), proto.getPatternFlags());
}
String charset = proto.getCharset();
if (charset.length() > 0) { if (charset.length() > 0) {
try { try {
comparator.setCharset(Charset.forName(charset)); comparator.getEngine().setCharset(charset);
} catch (IllegalCharsetNameException e) { } catch (IllegalCharsetNameException e) {
LOG.error("invalid charset", e); LOG.error("invalid charset", e);
} }
@ -170,11 +199,221 @@ public class RegexStringComparator extends ByteArrayComparable {
boolean areSerializedFieldsEqual(ByteArrayComparable other) { boolean areSerializedFieldsEqual(ByteArrayComparable other) {
if (other == this) return true; if (other == this) return true;
if (!(other instanceof RegexStringComparator)) return false; if (!(other instanceof RegexStringComparator)) return false;
RegexStringComparator comparator = (RegexStringComparator)other; RegexStringComparator comparator = (RegexStringComparator)other;
return super.areSerializedFieldsEqual(comparator) return super.areSerializedFieldsEqual(comparator)
&& this.pattern.toString().equals(comparator.pattern.toString()) && engine.getClass().isInstance(comparator.getEngine())
&& this.pattern.flags() == comparator.pattern.flags() && engine.getPattern().equals(comparator.getEngine().getPattern())
&& this.charset.equals(comparator.charset); && engine.getFlags() == comparator.getEngine().getFlags()
&& engine.getCharset().equals(comparator.getEngine().getCharset());
}
/**
 * @return the engine instance this comparator delegates matching and
 *         serialization to
 */
Engine getEngine() {
  return this.engine;
}
/**
* Internal abstraction over a regular expression matching engine. The
* enclosing comparator forwards charset configuration, matching and
* serialization to an implementation of this interface.
*/
static interface Engine {
/**
* Returns the string representation of the configured regular expression
* for matching
* @return the regular expression source
*/
String getPattern();
/**
* Returns the set of configured match flags as a bit mask. The meaning of
* the bits is engine specific: JavaRegexEngine reports {@link Pattern}
* flags, JoniRegexEngine reports the options of its compiled joni regex.
* @return the engine's flag bit mask
*/
int getFlags();
/**
* Returns the name of the configured charset
* @return the charset name
*/
String getCharset();
/**
* Set the charset used when matching
* @param charset the name of the desired charset for matching
*/
void setCharset(final String charset);
/**
* Return the serialized form of the configured matcher
* @return the serialized bytes
*/
byte [] toByteArray();
/**
* Match the given input against the configured pattern
* @param value the data to be matched
* @param offset offset of the data to be matched
* @param length length of the data to be matched
* @return 0 if a match was made, 1 otherwise
*/
int compareTo(byte[] value, int offset, int length);
}
/**
 * {@link Engine} implementation backed by {@link Pattern}.
 * <p>
 * This is the default engine. Input bytes are decoded to a String with the
 * configured charset before matching.
 */
static class JavaRegexEngine implements Engine {
  private Charset charset = Charset.forName("UTF-8");
  private Pattern pattern;

  /**
   * @param regex a valid regular expression
   * @param flags java.util.regex.Pattern flags
   */
  public JavaRegexEngine(String regex, int flags) {
    this.pattern = Pattern.compile(regex, flags);
  }

  @Override
  public String getPattern() {
    return this.pattern.pattern();
  }

  @Override
  public int getFlags() {
    return this.pattern.flags();
  }

  @Override
  public String getCharset() {
    return this.charset.name();
  }

  @Override
  public void setCharset(String charset) {
    final Charset resolved = Charset.forName(charset);
    this.charset = resolved;
  }

  @Override
  public int compareTo(byte[] value, int offset, int length) {
    // See HBASE-9428. When the slice is less than half of the backing array,
    // copy just the relevant part first, or the JDK will copy the entire
    // byte[] during String decode.
    final boolean copySliceFirst = length < value.length / 2;
    final String decoded = copySliceFirst
        ? new String(Arrays.copyOfRange(value, offset, offset + length), charset)
        : new String(value, offset, length, charset);
    // find() gives a subsequence match rather than the full sequence match of
    // matches(), to adhere to the principle of least surprise.
    if (pattern.matcher(decoded).find()) {
      return 0;
    }
    return 1;
  }

  @Override
  public byte[] toByteArray() {
    return ComparatorProtos.RegexStringComparator.newBuilder()
        .setPattern(pattern.pattern())
        .setPatternFlags(pattern.flags())
        .setCharset(charset.name())
        .setEngine(EngineType.JAVA.name())
        .build()
        .toByteArray();
  }
}
/**
* Implementation of the Engine interface using JRuby's joni regex engine.
* <p>
* This engine operates on byte arrays directly so is expected to be more GC
* friendly, and reportedly is twice as fast as Java's Pattern engine.
* <p>
* NOTE: Only the {@link Pattern} flags CASE_INSENSITIVE, DOTALL, and
* MULTILINE are supported.
*/
static class JoniRegexEngine implements Engine {
private Encoding encoding = UTF8Encoding.INSTANCE;
private String regex;
private Regex pattern;
/**
 * Compiles the expression with joni using Java regex syntax and the current
 * encoding.
 * @param regex a valid regular expression
 * @param flags java.util.regex.Pattern flags, translated to joni options
 */
public JoniRegexEngine(String regex, int flags) {
  final byte[] regexBytes = Bytes.toBytes(regex);
  final int joniOptions = patternToJoniFlags(flags);
  this.regex = regex;
  this.pattern = new Regex(regexBytes, 0, regexBytes.length, joniOptions, encoding, Syntax.Java);
}
// Returns the expression string exactly as it was handed to the constructor.
@Override
public String getPattern() {
  return this.regex;
}
// Reports the options of the compiled joni regex as-is; they are not
// translated back to java.util.regex.Pattern flags here.
@Override
public int getFlags() {
  final int joniOptions = pattern.getOptions();
  return joniOptions;
}
// Returns the charset name reported by the currently configured encoding.
@Override
public String getCharset() {
  final String charsetName = encoding.getCharsetName();
  return charsetName;
}
// Charset names are resolved to joni encodings by setEncoding.
@Override
public void setCharset(String name) {
  this.setEncoding(name);
}
/**
 * Searches the slice {@code [offset, offset + length)} of {@code value} for a
 * subsequence matching the configured pattern.
 * @param value the data to be matched
 * @param offset offset of the data to be matched
 * @param length length of the data to be matched
 * @return 0 if a match was made, 1 otherwise
 */
@Override
public int compareTo(byte[] value, int offset, int length) {
  // Use subsequence match instead of full sequence match to adhere to the
  // principle of least surprise.
  //
  // joni's search() takes an absolute end position ("range"), not a length,
  // so the end of the slice must be computed explicitly. The matcher is also
  // bounded to the slice so that anchors, look-behind and the extent of a
  // match cannot reach bytes outside [offset, offset + length).
  final int end = offset + length;
  Matcher m = pattern.matcher(value, offset, end);
  return m.search(offset, end, pattern.getOptions()) < 0 ? 1 : 0;
}
// Serializes pattern, flags (translated back to java.util.regex.Pattern
// flags), charset name and the JONI engine marker as a protobuf message.
@Override
public byte[] toByteArray() {
  final int patternFlags = joniToPatternFlags(pattern.getOptions());
  return ComparatorProtos.RegexStringComparator.newBuilder()
      .setPattern(regex)
      .setPatternFlags(patternFlags)
      .setCharset(encoding.getCharsetName())
      .setEngine(EngineType.JONI.name())
      .build()
      .toByteArray();
}
/**
 * Translates java.util.regex.Pattern flags into joni options. Only
 * CASE_INSENSITIVE, DOTALL and MULTILINE are considered; other bits are
 * dropped.
 * @param flags java.util.regex.Pattern flags
 * @return the corresponding joni option bit mask
 */
private int patternToJoniFlags(int flags) {
  final boolean ignoreCase = (flags & Pattern.CASE_INSENSITIVE) != 0;
  final boolean dotAll = (flags & Pattern.DOTALL) != 0;
  final boolean multiLine = (flags & Pattern.MULTILINE) != 0;

  int joniOptions = ignoreCase ? Option.IGNORECASE : 0;
  if (dotAll) {
    // joni's MULTILINE is the counterpart of Pattern.DOTALL, NOT of
    // Pattern.MULTILINE
    joniOptions |= Option.MULTILINE;
  }
  if (multiLine) {
    // This is what Java 8's Nashorn engine does when using joni and
    // translating Pattern's MULTILINE flag
    joniOptions = (joniOptions & ~Option.SINGLELINE) | Option.NEGATE_SINGLELINE;
  }
  return joniOptions;
}
/**
 * Translates joni options back into java.util.regex.Pattern flags; the
 * inverse of patternToJoniFlags for the three supported flags.
 * @param flags joni option bit mask
 * @return the corresponding java.util.regex.Pattern flags
 */
private int joniToPatternFlags(int flags) {
  int patternFlags = ((flags & Option.IGNORECASE) != 0) ? Pattern.CASE_INSENSITIVE : 0;
  // joni's MULTILINE does NOT mean Pattern.MULTILINE, it is equivalent to
  // Pattern.DOTALL
  patternFlags |= ((flags & Option.MULTILINE) != 0) ? Pattern.DOTALL : 0;
  // NEGATE_SINGLELINE is what stands for Pattern.MULTILINE
  patternFlags |= ((flags & Option.NEGATE_SINGLELINE) != 0) ? Pattern.MULTILINE : 0;
  return patternFlags;
}
/**
 * Looks up the named encoding in joni's encoding database and makes it the
 * current encoding.
 * @param name the charset name to look up
 * @throws IllegalCharsetNameException if the database has no entry for the name
 */
private void setEncoding(String name) {
  // NOTE(review): only the field is replaced; the Regex compiled in the
  // constructor is not rebuilt here - confirm whether matching is meant to
  // honor an encoding set after construction.
  final EncodingDB.Entry entry = EncodingDB.getEncodings().get(Bytes.toBytes(name));
  if (entry == null) {
    throw new IllegalCharsetNameException(name);
  }
  encoding = entry.getEncoding();
}
} }
} }

View File

@ -3292,6 +3292,21 @@ public final class ComparatorProtos {
*/ */
com.google.protobuf.ByteString com.google.protobuf.ByteString
getCharsetBytes(); getCharsetBytes();
// optional string engine = 4;
/**
* <code>optional string engine = 4;</code>
*/
boolean hasEngine();
/**
* <code>optional string engine = 4;</code>
*/
java.lang.String getEngine();
/**
* <code>optional string engine = 4;</code>
*/
com.google.protobuf.ByteString
getEngineBytes();
} }
/** /**
* Protobuf type {@code RegexStringComparator} * Protobuf type {@code RegexStringComparator}
@ -3359,6 +3374,11 @@ public final class ComparatorProtos {
charset_ = input.readBytes(); charset_ = input.readBytes();
break; break;
} }
case 34: {
bitField0_ |= 0x00000008;
engine_ = input.readBytes();
break;
}
} }
} }
} catch (com.google.protobuf.InvalidProtocolBufferException e) { } catch (com.google.protobuf.InvalidProtocolBufferException e) {
@ -3501,10 +3521,54 @@ public final class ComparatorProtos {
} }
} }
// optional string engine = 4;
public static final int ENGINE_FIELD_NUMBER = 4;
private java.lang.Object engine_;
/**
* <code>optional string engine = 4;</code>
*/
public boolean hasEngine() {
return ((bitField0_ & 0x00000008) == 0x00000008);
}
/**
* <code>optional string engine = 4;</code>
*/
public java.lang.String getEngine() {
java.lang.Object ref = engine_;
if (ref instanceof java.lang.String) {
return (java.lang.String) ref;
} else {
com.google.protobuf.ByteString bs =
(com.google.protobuf.ByteString) ref;
java.lang.String s = bs.toStringUtf8();
if (bs.isValidUtf8()) {
engine_ = s;
}
return s;
}
}
/**
* <code>optional string engine = 4;</code>
*/
public com.google.protobuf.ByteString
getEngineBytes() {
java.lang.Object ref = engine_;
if (ref instanceof java.lang.String) {
com.google.protobuf.ByteString b =
com.google.protobuf.ByteString.copyFromUtf8(
(java.lang.String) ref);
engine_ = b;
return b;
} else {
return (com.google.protobuf.ByteString) ref;
}
}
private void initFields() { private void initFields() {
pattern_ = ""; pattern_ = "";
patternFlags_ = 0; patternFlags_ = 0;
charset_ = ""; charset_ = "";
engine_ = "";
} }
private byte memoizedIsInitialized = -1; private byte memoizedIsInitialized = -1;
public final boolean isInitialized() { public final boolean isInitialized() {
@ -3539,6 +3603,9 @@ public final class ComparatorProtos {
if (((bitField0_ & 0x00000004) == 0x00000004)) { if (((bitField0_ & 0x00000004) == 0x00000004)) {
output.writeBytes(3, getCharsetBytes()); output.writeBytes(3, getCharsetBytes());
} }
if (((bitField0_ & 0x00000008) == 0x00000008)) {
output.writeBytes(4, getEngineBytes());
}
getUnknownFields().writeTo(output); getUnknownFields().writeTo(output);
} }
@ -3560,6 +3627,10 @@ public final class ComparatorProtos {
size += com.google.protobuf.CodedOutputStream size += com.google.protobuf.CodedOutputStream
.computeBytesSize(3, getCharsetBytes()); .computeBytesSize(3, getCharsetBytes());
} }
if (((bitField0_ & 0x00000008) == 0x00000008)) {
size += com.google.protobuf.CodedOutputStream
.computeBytesSize(4, getEngineBytes());
}
size += getUnknownFields().getSerializedSize(); size += getUnknownFields().getSerializedSize();
memoizedSerializedSize = size; memoizedSerializedSize = size;
return size; return size;
@ -3598,6 +3669,11 @@ public final class ComparatorProtos {
result = result && getCharset() result = result && getCharset()
.equals(other.getCharset()); .equals(other.getCharset());
} }
result = result && (hasEngine() == other.hasEngine());
if (hasEngine()) {
result = result && getEngine()
.equals(other.getEngine());
}
result = result && result = result &&
getUnknownFields().equals(other.getUnknownFields()); getUnknownFields().equals(other.getUnknownFields());
return result; return result;
@ -3623,6 +3699,10 @@ public final class ComparatorProtos {
hash = (37 * hash) + CHARSET_FIELD_NUMBER; hash = (37 * hash) + CHARSET_FIELD_NUMBER;
hash = (53 * hash) + getCharset().hashCode(); hash = (53 * hash) + getCharset().hashCode();
} }
if (hasEngine()) {
hash = (37 * hash) + ENGINE_FIELD_NUMBER;
hash = (53 * hash) + getEngine().hashCode();
}
hash = (29 * hash) + getUnknownFields().hashCode(); hash = (29 * hash) + getUnknownFields().hashCode();
memoizedHashCode = hash; memoizedHashCode = hash;
return hash; return hash;
@ -3738,6 +3818,8 @@ public final class ComparatorProtos {
bitField0_ = (bitField0_ & ~0x00000002); bitField0_ = (bitField0_ & ~0x00000002);
charset_ = ""; charset_ = "";
bitField0_ = (bitField0_ & ~0x00000004); bitField0_ = (bitField0_ & ~0x00000004);
engine_ = "";
bitField0_ = (bitField0_ & ~0x00000008);
return this; return this;
} }
@ -3778,6 +3860,10 @@ public final class ComparatorProtos {
to_bitField0_ |= 0x00000004; to_bitField0_ |= 0x00000004;
} }
result.charset_ = charset_; result.charset_ = charset_;
if (((from_bitField0_ & 0x00000008) == 0x00000008)) {
to_bitField0_ |= 0x00000008;
}
result.engine_ = engine_;
result.bitField0_ = to_bitField0_; result.bitField0_ = to_bitField0_;
onBuilt(); onBuilt();
return result; return result;
@ -3807,6 +3893,11 @@ public final class ComparatorProtos {
charset_ = other.charset_; charset_ = other.charset_;
onChanged(); onChanged();
} }
if (other.hasEngine()) {
bitField0_ |= 0x00000008;
engine_ = other.engine_;
onChanged();
}
this.mergeUnknownFields(other.getUnknownFields()); this.mergeUnknownFields(other.getUnknownFields());
return this; return this;
} }
@ -4027,6 +4118,80 @@ public final class ComparatorProtos {
return this; return this;
} }
// optional string engine = 4;
private java.lang.Object engine_ = "";
/**
* <code>optional string engine = 4;</code>
*/
public boolean hasEngine() {
return ((bitField0_ & 0x00000008) == 0x00000008);
}
/**
* <code>optional string engine = 4;</code>
*/
public java.lang.String getEngine() {
java.lang.Object ref = engine_;
if (!(ref instanceof java.lang.String)) {
java.lang.String s = ((com.google.protobuf.ByteString) ref)
.toStringUtf8();
engine_ = s;
return s;
} else {
return (java.lang.String) ref;
}
}
/**
* <code>optional string engine = 4;</code>
*/
public com.google.protobuf.ByteString
getEngineBytes() {
java.lang.Object ref = engine_;
if (ref instanceof String) {
com.google.protobuf.ByteString b =
com.google.protobuf.ByteString.copyFromUtf8(
(java.lang.String) ref);
engine_ = b;
return b;
} else {
return (com.google.protobuf.ByteString) ref;
}
}
/**
* <code>optional string engine = 4;</code>
*/
public Builder setEngine(
java.lang.String value) {
if (value == null) {
throw new NullPointerException();
}
bitField0_ |= 0x00000008;
engine_ = value;
onChanged();
return this;
}
/**
* <code>optional string engine = 4;</code>
*/
public Builder clearEngine() {
bitField0_ = (bitField0_ & ~0x00000008);
engine_ = getDefaultInstance().getEngine();
onChanged();
return this;
}
/**
* <code>optional string engine = 4;</code>
*/
public Builder setEngineBytes(
com.google.protobuf.ByteString value) {
if (value == null) {
throw new NullPointerException();
}
bitField0_ |= 0x00000008;
engine_ = value;
onChanged();
return this;
}
// @@protoc_insertion_point(builder_scope:RegexStringComparator) // @@protoc_insertion_point(builder_scope:RegexStringComparator)
} }
@ -4614,12 +4779,12 @@ public final class ComparatorProtos {
"\002(\0132\024.ByteArrayComparable\022,\n\nbitwise_op\030" + "\002(\0132\024.ByteArrayComparable\022,\n\nbitwise_op\030" +
"\002 \002(\0162\030.BitComparator.BitwiseOp\"%\n\tBitwi" + "\002 \002(\0162\030.BitComparator.BitwiseOp\"%\n\tBitwi" +
"seOp\022\007\n\003AND\020\001\022\006\n\002OR\020\002\022\007\n\003XOR\020\003\"\020\n\016NullCo", "seOp\022\007\n\003AND\020\001\022\006\n\002OR\020\002\022\007\n\003XOR\020\003\"\020\n\016NullCo",
"mparator\"P\n\025RegexStringComparator\022\017\n\007pat" + "mparator\"`\n\025RegexStringComparator\022\017\n\007pat" +
"tern\030\001 \002(\t\022\025\n\rpattern_flags\030\002 \002(\005\022\017\n\007cha" + "tern\030\001 \002(\t\022\025\n\rpattern_flags\030\002 \002(\005\022\017\n\007cha" +
"rset\030\003 \002(\t\"%\n\023SubstringComparator\022\016\n\006sub" + "rset\030\003 \002(\t\022\016\n\006engine\030\004 \001(\t\"%\n\023SubstringC" +
"str\030\001 \002(\tBF\n*org.apache.hadoop.hbase.pro" + "omparator\022\016\n\006substr\030\001 \002(\tBF\n*org.apache." +
"tobuf.generatedB\020ComparatorProtosH\001\210\001\001\240\001" + "hadoop.hbase.protobuf.generatedB\020Compara" +
"\001" "torProtosH\001\210\001\001\240\001\001"
}; };
com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner =
new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
@ -4667,7 +4832,7 @@ public final class ComparatorProtos {
internal_static_RegexStringComparator_fieldAccessorTable = new internal_static_RegexStringComparator_fieldAccessorTable = new
com.google.protobuf.GeneratedMessage.FieldAccessorTable( com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_RegexStringComparator_descriptor, internal_static_RegexStringComparator_descriptor,
new java.lang.String[] { "Pattern", "PatternFlags", "Charset", }); new java.lang.String[] { "Pattern", "PatternFlags", "Charset", "Engine", });
internal_static_SubstringComparator_descriptor = internal_static_SubstringComparator_descriptor =
getDescriptor().getMessageTypes().get(7); getDescriptor().getMessageTypes().get(7);
internal_static_SubstringComparator_fieldAccessorTable = new internal_static_SubstringComparator_fieldAccessorTable = new

View File

@ -61,6 +61,7 @@ message RegexStringComparator {
required string pattern = 1; required string pattern = 1;
required int32 pattern_flags = 2; required int32 pattern_flags = 2;
required string charset = 3; required string charset = 3;
optional string engine = 4;
} }
message SubstringComparator { message SubstringComparator {

View File

@ -0,0 +1,197 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.filter;
import static org.junit.Assert.*;
import java.util.regex.Pattern;
import org.apache.hadoop.hbase.filter.RegexStringComparator.EngineType;
import org.apache.hadoop.hbase.testclassification.FilterTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Tests for {@link RegexStringComparator}: a serialization round trip for each
 * engine type, and a shared table of regex cases that is run against both the
 * JAVA and the JONI engine.
 */
@Category({FilterTests.class, SmallTests.class})
public class TestRegexComparator {

  /** Round-trips a comparator through its serialized form for each engine type. */
  @Test
  public void testSerialization() throws Exception {
    // Default engine is the Java engine
    RegexStringComparator a = new RegexStringComparator("a|b");
    RegexStringComparator b = RegexStringComparator.parseFrom(a.toByteArray());
    assertTrue(a.areSerializedFieldsEqual(b));
    assertTrue(b.getEngine() instanceof RegexStringComparator.JavaRegexEngine);

    // joni engine
    a = new RegexStringComparator("a|b", EngineType.JONI);
    b = RegexStringComparator.parseFrom(a.toByteArray());
    assertTrue(a.areSerializedFieldsEqual(b));
    assertTrue(b.getEngine() instanceof RegexStringComparator.JoniRegexEngine);
  }

  /** Runs every entry of the case table against the java.util.regex engine. */
  @Test
  public void testJavaEngine() throws Exception {
    assertAllCases(EngineType.JAVA);
  }

  /** Runs every entry of the case table against the joni engine. */
  @Test
  public void testJoniEngine() throws Exception {
    assertAllCases(EngineType.JONI);
  }

  /**
   * Checks each entry of the case table with a comparator built for the given
   * engine. A comparator result of 0 counts as a match.
   * @param engine the engine implementation to exercise
   */
  private static void assertAllCases(EngineType engine) {
    for (TestCase t : TEST_CASES) {
      boolean result = new RegexStringComparator(t.regex, t.flags, engine)
          .compareTo(Bytes.toBytes(t.haystack)) == 0;
      // JUnit's contract is (message, expected, actual); keeping that order
      // makes the failure report name the right value as "expected".
      assertEquals("Regex '" + t.regex + "' failed test '" + t.haystack + "'", t.expected,
          result);
    }
  }

  /** One row of the case table: pattern, flags, input and whether a match is expected. */
  private static final class TestCase {
    final String regex;
    final String haystack;
    final int flags;
    final boolean expected;

    /** Case using Pattern.DOTALL, the flag the comparator's default constructors use. */
    public TestCase(String regex, String haystack, boolean expected) {
      this(regex, Pattern.DOTALL, haystack, expected);
    }

    public TestCase(String regex, int flags, String haystack, boolean expected) {
      this.regex = regex;
      this.flags = flags;
      this.haystack = haystack;
      this.expected = expected;
    }
  }

  // These are a subset of the regex tests from OpenJDK 7
  private static final TestCase[] TEST_CASES = {
    new TestCase("a|b", "a", true),
    new TestCase("a|b", "b", true),
    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "A", true),
    new TestCase("a|b", Pattern.CASE_INSENSITIVE, "B", true),
    new TestCase("a|b", "z", false),
    new TestCase("a|b|cd", "cd", true),
    new TestCase("z(a|ac)b", "zacb", true),
    new TestCase("[abc]+", "ababab", true),
    new TestCase("[abc]+", "defg", false),
    new TestCase("[abc]+[def]+[ghi]+", "zzzaaddggzzz", true),
    new TestCase("[a-\\u4444]+", "za-9z", true),
    new TestCase("[^abc]+", "ababab", false),
    new TestCase("[^abc]+", "aaabbbcccdefg", true),
    new TestCase("[abc^b]", "b", true),
    new TestCase("[abc[def]]", "b", true),
    new TestCase("[abc[def]]", "e", true),
    new TestCase("[a-c[d-f[g-i]]]", "h", true),
    new TestCase("[a-c[d-f[g-i]]m]", "m", true),
    new TestCase("[a-c&&[d-f]]", "a", false),
    new TestCase("[a-c&&[d-f]]", "z", false),
    new TestCase("[a-m&&m-z&&a-c]", "m", false),
    new TestCase("[a-m&&m-z&&a-z]", "m", true),
    new TestCase("[[a-m]&&[^a-c]]", "a", false),
    new TestCase("[[a-m]&&[^a-c]]", "d", true),
    new TestCase("[[a-c][d-f]&&abc[def]]", "e", true),
    new TestCase("[[a-c]&&[b-d]&&[c-e]]", "c", true),
    new TestCase("[[a-c]&&[b-d][c-e]&&[u-z]]", "c", false),
    new TestCase("[[a]&&[b][c][a]&&[^d]]", "a", true),
    new TestCase("[[a]&&[b][c][a]&&[^d]]", "d", false),
    new TestCase("[[[a-d]&&[c-f]]&&[c]&&c&&[cde]]", "c", true),
    new TestCase("[x[[wz]abc&&bcd[z]]&&[u-z]]", "z", true),
    new TestCase("a.c.+", "a#c%&", true),
    new TestCase("ab.", "ab\n", true),
    new TestCase("(?s)ab.", "ab\n", true),
    new TestCase("ab\\wc", "abcc", true),
    new TestCase("\\W\\w\\W", "#r#", true),
    new TestCase("\\W\\w\\W", "rrrr#ggg", false),
    new TestCase("abc[\\sdef]*", "abc def", true),
    new TestCase("abc[\\sy-z]*", "abc y z", true),
    new TestCase("abc[a-d\\sm-p]*", "abcaa mn p", true),
    new TestCase("\\s\\s\\s", "blah err", false),
    new TestCase("\\S\\S\\s", "blah err", true),
    new TestCase("ab\\dc", "ab9c", true),
    new TestCase("\\d\\d\\d", "blah45", false),
    new TestCase("^abc", "abcdef", true),
    new TestCase("^abc", "bcdabc", false),
    new TestCase("^(a)?a", "a", true),
    new TestCase("^(aa(bb)?)+$", "aabbaa", true),
    new TestCase("((a|b)?b)+", "b", true),
    new TestCase("^(a(b)?)+$", "aba", true),
    new TestCase("^(a(b(c)?)?)?abc", "abc", true),
    new TestCase("^(a(b(c))).*", "abc", true),
    new TestCase("a?b", "aaaab", true),
    new TestCase("a?b", "aaacc", false),
    new TestCase("a??b", "aaaab", true),
    new TestCase("a??b", "aaacc", false),
    new TestCase("a?+b", "aaaab", true),
    new TestCase("a?+b", "aaacc", false),
    new TestCase("a+b", "aaaab", true),
    new TestCase("a+b", "aaacc", false),
    new TestCase("a+?b", "aaaab", true),
    new TestCase("a+?b", "aaacc", false),
    new TestCase("a++b", "aaaab", true),
    new TestCase("a++b", "aaacc", false),
    new TestCase("a{2,3}", "a", false),
    new TestCase("a{2,3}", "aa", true),
    new TestCase("a{2,3}", "aaa", true),
    new TestCase("a{3,}", "zzzaaaazzz", true),
    new TestCase("a{3,}", "zzzaazzz", false),
    new TestCase("abc(?=d)", "zzzabcd", true),
    new TestCase("abc(?=d)", "zzzabced", false),
    new TestCase("abc(?!d)", "zzabcd", false),
    new TestCase("abc(?!d)", "zzabced", true),
    new TestCase("\\w(?<=a)", "###abc###", true),
    new TestCase("\\w(?<=a)", "###ert###", false),
    new TestCase("(?<!a)c", "bc", true),
    new TestCase("(?<!a)c", "ac", false),
    new TestCase("(a+b)+", "ababab", true),
    new TestCase("(a+b)+", "accccd", false),
    new TestCase("(ab)+", "ababab", true),
    new TestCase("(ab)+", "accccd", false),
    new TestCase("(ab)(cd*)", "zzzabczzz", true),
    new TestCase("abc(d)*abc", "abcdddddabc", true),
    new TestCase("a*b", "aaaab", true),
    new TestCase("a*b", "b", true),
    new TestCase("a*b", "aaaac", false),
    new TestCase(".*?b", "aaaab", true),
    new TestCase("a*+b", "aaaab", true),
    new TestCase("a*+b", "b", true),
    new TestCase("a*+b", "aaaac", false),
    new TestCase("(?i)foobar", "fOobAr", true),
    new TestCase("f(?i)oobar", "fOobAr", true),
    new TestCase("f(?i)oobar", "FOobAr", false),
    new TestCase("foo(?i)bar", "fOobAr", false),
    new TestCase("(?i)foo[bar]+", "foObAr", true),
    new TestCase("(?i)foo[a-r]+", "foObAr", true),
    new TestCase("abc(?x)blah", "abcblah", true),
    new TestCase("abc(?x) blah", "abcblah", true),
    new TestCase("abc(?x) blah blech", "abcblahblech", true),
    new TestCase("[\\n-#]", "!", true),
    new TestCase("[\\n-#]", "-", false),
    new TestCase("[\\043]+", "blahblah#blech", true),
    new TestCase("[\\042-\\044]+", "blahblah#blech", true),
    new TestCase("[\\u1234-\\u1236]", "blahblah\u1235blech", true),
    new TestCase("[^\043]*", "blahblah#blech", true),
    new TestCase("(|f)?+", "foo", true),
  };
}

View File

@ -936,6 +936,7 @@
<jamon-runtime.version>2.3.1</jamon-runtime.version> <jamon-runtime.version>2.3.1</jamon-runtime.version>
<jettison.version>1.3.1</jettison.version> <jettison.version>1.3.1</jettison.version>
<netty.version>4.0.19.Final</netty.version> <netty.version>4.0.19.Final</netty.version>
<joni.version>2.1.2</joni.version>
<!-- Plugin Dependencies --> <!-- Plugin Dependencies -->
<maven.assembly.version>2.4</maven.assembly.version> <maven.assembly.version>2.4</maven.assembly.version>
<maven.antrun.version>1.6</maven.antrun.version> <maven.antrun.version>1.6</maven.antrun.version>
@ -1233,6 +1234,11 @@
</exclusion> </exclusion>
</exclusions> </exclusions>
</dependency> </dependency>
<dependency>
<groupId>org.jruby.joni</groupId>
<artifactId>joni</artifactId>
<version>${joni.version}</version>
</dependency>
<dependency> <dependency>
<groupId>org.mortbay.jetty</groupId> <groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId> <artifactId>jetty-util</artifactId>