diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java new file mode 100644 index 00000000000..d61d99b0032 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/AbstractBytesIndexFieldData.java @@ -0,0 +1,172 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.elasticsearch.index.fielddata.plain;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.FilteredTermsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.ElasticSearchException;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
+import org.elasticsearch.index.fielddata.AtomicFieldData;
+import org.elasticsearch.index.fielddata.FieldDataType;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.fielddata.IndexFieldDataCache;
+import org.elasticsearch.index.fielddata.ScriptDocValues;
+import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
+import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
+import org.elasticsearch.index.mapper.FieldMapper.Names;
+
+/**
+ * Shared base for the bytes (string) field data implementations that expose ordinals.
+ * <p>
+ * It provides the cache-backed {@link #load(AtomicReaderContext)}, the comparator source, and
+ * {@link #filter(Terms, AtomicReader)}, which lets subclasses drop terms while loading based on
+ * the field data settings under <code>filter.frequency.</code> and <code>filter.regex.</code>.
+ * <p>
+ * NOTE(review): the generic parameters were lost when this patch was extracted; <code>FD</code> is
+ * required by {@link #load(AtomicReaderContext)}, but its bound below is reconstructed from the
+ * imports and should be confirmed against the upstream file.
+ */
+public abstract class AbstractBytesIndexFieldData<FD extends AtomicFieldData.WithOrdinals<ScriptDocValues.Strings>> extends AbstractIndexFieldData<FD> implements IndexFieldData.WithOrdinals<FD> {
+
+    /** Settings found under <code>filter.frequency.</code>, or <code>null</code> when none are set. */
+    private final Settings frequency;
+
+    /** Settings found under <code>filter.regex.</code>, or <code>null</code> when none are set. */
+    private final Settings regex;
+
+    protected AbstractBytesIndexFieldData(Index index, Settings indexSettings, Names fieldNames, FieldDataType fieldDataType,
+            IndexFieldDataCache cache) {
+        super(index, indexSettings, fieldNames, fieldDataType, cache);
+        final Settings settings = fieldDataType.getSettings();
+        frequency = getPrefixSettings(settings, "filter.frequency.");
+        regex = getPrefixSettings(settings, "filter.regex.");
+    }
+
+    /**
+     * Returns the sub-settings stored under <code>prefix</code>, or <code>null</code> if there are
+     * none, so that callers can use a plain null check to decide whether a filter is configured.
+     */
+    private static Settings getPrefixSettings(Settings settings, String prefix) {
+        final Settings byPrefix = settings.getByPrefix(prefix);
+        return byPrefix.getAsMap().isEmpty() ? null : byPrefix;
+    }
+
+    @Override
+    public final boolean valuesOrdered() {
+        return true;
+    }
+
+    /**
+     * Loads the per-segment field data through the cache.
+     *
+     * @throws ElasticSearchException if loading fails; an {@link ElasticSearchException} raised by
+     *         the cache is rethrown as is, any other throwable is wrapped with its message and cause
+     */
+    @Override
+    public FD load(AtomicReaderContext context) {
+        try {
+            return cache.load(context, this);
+        } catch (ElasticSearchException e) {
+            throw e;
+        } catch (Throwable e) {
+            throw new ElasticSearchException(e.getMessage(), e);
+        }
+    }
+
+    @Override
+    public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, SortMode sortMode) {
+        // TODO support "missingValue" for sortMissingValue options here...
+        return new BytesRefFieldComparatorSource(this, sortMode);
+    }
+
+    /**
+     * Returns an enumeration over <code>terms</code> with the configured filters applied.
+     * The frequency filter (if configured) wraps the raw enumeration first and the regex filter
+     * (if configured) wraps the result.
+     *
+     * @param terms  the terms of the field being loaded
+     * @param reader the segment reader the terms belong to
+     * @return the (possibly wrapped) enumeration, or <code>null</code> if <code>terms</code> has no iterator
+     */
+    protected TermsEnum filter(Terms terms, AtomicReader reader) throws IOException {
+        TermsEnum iterator = terms.iterator(null);
+        if (iterator == null) {
+            return null;
+        }
+        // Neither filter factory returns null, so no further null checks are needed below.
+        if (frequency != null) {
+            iterator = FrequencyFilter.filter(iterator, terms, reader, frequency);
+        }
+        if (regex != null) {
+            iterator = RegexFilter.filter(iterator, terms, reader, regex);
+        }
+        return iterator;
+    }
+
+    /** Accepts only terms whose document frequency lies within <code>[minFreq, maxFreq]</code>. */
+    private static final class FrequencyFilter extends FilteredTermsEnum {
+
+        private final int minFreq;
+        private final int maxFreq;
+
+        public FrequencyFilter(TermsEnum delegate, int minFreq, int maxFreq) {
+            super(delegate, false);
+            this.minFreq = minFreq;
+            this.maxFreq = maxFreq;
+        }
+
+        /**
+         * Wraps <code>toFilter</code> according to the <code>min</code>, <code>max</code> and
+         * <code>min_segment_size</code> settings.
+         * <p>
+         * A <code>min</code>/<code>max</code> value of 1.0 or more is used as an absolute document
+         * count; a smaller value is taken as a fraction of the number of documents that have the
+         * field (falling back to <code>reader.maxDoc()</code> when that count is reported as -1).
+         * The enumeration is returned unwrapped unless that document count is larger than
+         * <code>min_segment_size</code>.
+         */
+        public static TermsEnum filter(TermsEnum toFilter, Terms terms, AtomicReader reader, Settings settings) throws IOException {
+            int docCount = terms.getDocCount();
+            if (docCount == -1) {
+                docCount = reader.maxDoc();
+            }
+            final double minFrequency = settings.getAsDouble("min", 0d);
+            final double maxFrequency = settings.getAsDouble("max", docCount + 1d);
+            final int minSegmentSize = settings.getAsInt("min_segment_size", 0);
+            if (minSegmentSize >= docCount) {
+                return toFilter;
+            }
+            final int minFreq = minFrequency >= 1.0 ? (int) minFrequency : (int) (docCount * minFrequency);
+            final int maxFreq = maxFrequency >= 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency);
+            assert minFreq < maxFreq : "minFreq [" + minFreq + "] must be smaller than maxFreq [" + maxFreq + "]";
+            return new FrequencyFilter(toFilter, minFreq, maxFreq);
+        }
+
+        @Override
+        protected AcceptStatus accept(BytesRef term) throws IOException {
+            final int docFreq = docFreq();
+            return (docFreq >= minFreq && docFreq <= maxFreq) ? AcceptStatus.YES : AcceptStatus.NO;
+        }
+    }
+
+    /** Accepts only terms whose UTF-16 form is matched in full by the configured pattern. */
+    private static final class RegexFilter extends FilteredTermsEnum {
+
+        // A single Matcher and scratch buffer are reused for every term instead of allocating per call.
+        private final Matcher matcher;
+        private final CharsRef spare = new CharsRef();
+
+        public RegexFilter(TermsEnum delegate, Matcher matcher) {
+            super(delegate, false);
+            this.matcher = matcher;
+        }
+
+        /**
+         * Wraps <code>iterator</code> with the regular expression stored under the
+         * <code>pattern</code> setting; returns <code>iterator</code> unchanged if no pattern is set.
+         */
+        public static TermsEnum filter(TermsEnum iterator, Terms terms, AtomicReader reader, Settings regex) {
+            final String pattern = regex.get("pattern");
+            if (pattern == null) {
+                return iterator;
+            }
+            return new RegexFilter(iterator, Pattern.compile(pattern).matcher(""));
+        }
+
+        @Override
+        protected AcceptStatus accept(BytesRef term) throws IOException {
+            UnicodeUtil.UTF8toUTF16(term, spare);
+            matcher.reset(spare);
+            return matcher.matches() ? AcceptStatus.YES : AcceptStatus.NO;
+        }
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/ConcreteBytesRefIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/ConcreteBytesRefIndexFieldData.java index 5176e08b406..d17edf7d812 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/plain/ConcreteBytesRefIndexFieldData.java +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/ConcreteBytesRefIndexFieldData.java @@ -19,35 +19,30 @@ package org.elasticsearch.index.fielddata.plain; +import java.util.ArrayList; + import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; -import org.elasticsearch.ElasticSearchException; 
-import org.elasticsearch.common.Nullable; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.Index; -import org.elasticsearch.index.fielddata.AbstractIndexFieldData; import org.elasticsearch.index.fielddata.FieldDataType; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldDataCache; -import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; -import org.elasticsearch.index.fielddata.fieldcomparator.SortMode; import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.settings.IndexSettings; -import java.util.ArrayList; - /** */ -public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData implements IndexFieldData.WithOrdinals { +public class ConcreteBytesRefIndexFieldData extends AbstractBytesIndexFieldData { public static class Builder implements IndexFieldData.Builder { @Override - public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { + public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { return new ConcreteBytesRefIndexFieldData(index, indexSettings, fieldNames, type, cache); } } @@ -56,24 +51,6 @@ public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData implements IndexFieldData.WithOrdinals { +public class FSTBytesIndexFieldData extends AbstractBytesIndexFieldData { public static class Builder implements IndexFieldData.Builder { @Override - public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { + public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, 
FieldDataType type, IndexFieldDataCache cache) { return new FSTBytesIndexFieldData(index, indexSettings, fieldNames, type, cache); } } - public FSTBytesIndexFieldData(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) { + FSTBytesIndexFieldData(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) { super(index, indexSettings, fieldNames, fieldDataType, cache); } - @Override - public boolean valuesOrdered() { - return true; - } - - @Override - public FSTBytesAtomicFieldData load(AtomicReaderContext context) { - try { - return cache.load(context, this); - } catch (Throwable e) { - if (e instanceof ElasticSearchException) { - throw (ElasticSearchException) e; - } else { - throw new ElasticSearchException(e.getMessage(), e); - } - } - } - @Override public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception { AtomicReader reader = context.reader(); @@ -96,7 +73,7 @@ public class FSTBytesIndexFieldData extends AbstractIndexFieldData implements IndexFieldData.WithOrdinals { +public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData { public static class Builder implements IndexFieldData.Builder { @Override - public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { + public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { return new PagedBytesIndexFieldData(index, indexSettings, fieldNames, type, cache); } } @@ -56,24 +55,6 @@ public class PagedBytesIndexFieldData extends AbstractIndexFieldData> list = new ArrayList>(typeMap.entrySet()); Preprocessor pre = new Preprocessor(); while (!list.isEmpty()) { diff --git 
a/src/test/java/org/elasticsearch/test/unit/index/fielddata/FilterFieldDataTest.java b/src/test/java/org/elasticsearch/test/unit/index/fielddata/FilterFieldDataTest.java new file mode 100644 index 00000000000..644eec3cb01 --- /dev/null +++ b/src/test/java/org/elasticsearch/test/unit/index/fielddata/FilterFieldDataTest.java @@ -0,0 +1,209 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.elasticsearch.test.unit.index.fielddata;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.equalTo;
+
+import java.util.Random;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.index.fielddata.AtomicFieldData;
+import org.elasticsearch.index.fielddata.AtomicFieldData.WithOrdinals;
+import org.elasticsearch.index.fielddata.BytesValues;
+import org.elasticsearch.index.fielddata.FieldDataType;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.fielddata.ScriptDocValues;
+import org.elasticsearch.index.fielddata.ScriptDocValues.Strings;
+import org.elasticsearch.index.fielddata.ordinals.Ordinals.Docs;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.testng.annotations.Test;
+
+/**
+ * Checks that the <code>filter.frequency.*</code> and <code>filter.regex.*</code> field data
+ * settings drop the expected terms for each of the ordinal-based string formats.
+ */
+public class FilterFieldDataTest extends AbstractFieldDataTests {
+
+    /** Field data formats every scenario is run against. */
+    private static final String[] FORMATS = { "fst", "paged_bytes", "concrete_bytes" };
+
+    /** Number of documents indexed by each test. */
+    private static final int NUM_DOCS = 1000;
+
+    @Override
+    protected FieldDataType getFieldDataType() {
+        // Not used by the tests in this class: every scenario builds its own FieldDataType
+        // carrying the filter settings under test.
+        // NOTE(review): confirm that the base class tolerates a null here.
+        return null;
+    }
+
+    @Test
+    public void testFilterByFrequency() throws Exception {
+        // The seed is printed so that a failing absolute/fractional combination can be replayed.
+        long seed = System.currentTimeMillis();
+        System.out.println("seed[testFilterByFrequency]: " + seed);
+        Random random = new Random(seed);
+        // high_freq: "5" in 200 docs, "10" in 100 docs, "100" in 10 docs.
+        // med_freq:  "10" in 100 docs, "100" in 10 docs.
+        for (int i = 0; i < NUM_DOCS; i++) {
+            Document d = new Document();
+            d.add(new StringField("id", "" + i, Field.Store.NO));
+            if (i % 100 == 0) {
+                d.add(new StringField("high_freq", "100", Field.Store.NO));
+                d.add(new StringField("low_freq", "100", Field.Store.NO));
+                d.add(new StringField("med_freq", "100", Field.Store.NO));
+            }
+            if (i % 10 == 0) {
+                d.add(new StringField("high_freq", "10", Field.Store.NO));
+                d.add(new StringField("med_freq", "10", Field.Store.NO));
+            }
+            if (i % 5 == 0) {
+                d.add(new StringField("high_freq", "5", Field.Store.NO));
+            }
+            writer.addDocument(d);
+        }
+        writer.forceMerge(1);
+        AtomicReaderContext context = refreshReader();
+
+        for (String format : FORMATS) {
+            // An upper bound of 100 docs (absolute, or 0.5 of the 200 docs with a value) drops "5".
+            assertTerms(loadFiltered(context, "high_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.frequency.min_segment_size", 100).put("filter.frequency.min", 0.0d)
+                    .put("filter.frequency.max", random.nextBoolean() ? 100 : 0.5d)),
+                    "10", "100");
+
+            // A lower bound of 101 docs keeps only "5".
+            assertTerms(loadFiltered(context, "high_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.frequency.min_segment_size", 100)
+                    .put("filter.frequency.min", random.nextBoolean() ? 101 : 101d/200.0d)
+                    .put("filter.frequency.max", 201)),
+                    "5");
+
+            // test # docs with value: only 100 docs have a med_freq value, which is not more than
+            // min_segment_size, so the frequency filter is skipped and both terms survive.
+            assertTerms(loadFiltered(context, "med_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.frequency.min_segment_size", 101)
+                    .put("filter.frequency.min", random.nextBoolean() ? 101 : 101d/200.0d)),
+                    "10", "100");
+
+            // Both filters combined: the frequency range [1, 99] keeps "100" only, which the regex also allows.
+            assertTerms(loadFiltered(context, "high_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.regex.pattern", "\\d{2,3}") // allows 10 & 100
+                    .put("filter.frequency.min_segment_size", 0)
+                    .put("filter.frequency.min", random.nextBoolean() ? 1 : 1d/200.0d) // 100, 10, 5
+                    .put("filter.frequency.max", random.nextBoolean() ? 99 : 99d/200.0d)), // 100
+                    "100");
+        }
+    }
+
+    @Test
+    public void testFilterByRegExp() throws Exception {
+        for (int i = 0; i < NUM_DOCS; i++) {
+            Document d = new Document();
+            d.add(new StringField("id", "" + i, Field.Store.NO));
+            if (i % 100 == 0) {
+                d.add(new StringField("high_freq", "100", Field.Store.NO));
+            }
+            if (i % 10 == 0) {
+                d.add(new StringField("high_freq", "10", Field.Store.NO));
+            }
+            if (i % 5 == 0) {
+                d.add(new StringField("high_freq", "5", Field.Store.NO));
+            }
+            writer.addDocument(d);
+        }
+        writer.forceMerge(1);
+        AtomicReaderContext context = refreshReader();
+
+        for (String format : FORMATS) {
+            // A single digit matches "5" only.
+            assertTerms(loadFiltered(context, "high_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.regex.pattern", "\\d")),
+                    "5");
+
+            // One or two digits match "10" and "5"; ordinals follow term order, so "10" comes first.
+            assertTerms(loadFiltered(context, "high_freq", ImmutableSettings.builder().put("format", format)
+                    .put("filter.regex.pattern", "\\d{1,2}")),
+                    "10", "5");
+        }
+    }
+
+    /**
+     * Clears the field data service, then loads <code>fieldName</code> directly from
+     * <code>context</code> as a "string" field data type configured with <code>settings</code>.
+     */
+    private BytesValues.WithOrdinals loadFiltered(AtomicReaderContext context, String fieldName,
+            ImmutableSettings.Builder settings) throws Exception {
+        ifdService.clear();
+        FieldDataType fieldDataType = new FieldDataType("string", settings);
+        IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names(fieldName), fieldDataType);
+        AtomicFieldData.WithOrdinals loadDirect = (WithOrdinals) fieldData.loadDirect(context);
+        return loadDirect.getBytesValues();
+    }
+
+    /**
+     * Asserts that exactly <code>expectedTerms</code> were loaded, in ordinal order starting at
+     * ordinal 1, and that all indexed documents are covered.
+     */
+    private static void assertTerms(BytesValues.WithOrdinals bytesValues, String... expectedTerms) {
+        Docs ordinals = bytesValues.ordinals();
+        assertThat(ordinals.getNumOrds(), equalTo(expectedTerms.length));
+        assertThat(ordinals.getNumDocs(), equalTo(NUM_DOCS));
+        for (int ord = 1; ord <= expectedTerms.length; ord++) {
+            assertThat(bytesValues.getValueByOrd(ord).utf8ToString(), equalTo(expectedTerms[ord - 1]));
+        }
+    }
+
+}