# FieldData Filter

FieldData is an in-memory representation of the term dictionary in an uninverted form. Under certain circumstances this FieldData representation can grow very large on high-cardinality fields like tokenized full-text. Depending on the use case, filtering the terms that are held in the FieldData representation can greatly improve execution performance and application stability.
FieldData Filters can be applied on a per-segment basis. During FieldData loading, the terms enumeration is passed through a filter predicate that either accepts or rejects a term.

## Frequency Filter

The Frequency Filter acts as a high / low pass filter based on the document frequency of a term within the segment that is loaded into field data. It allows rejecting terms that occur very frequently or very rarely, based either on absolute frequencies or on percentages relative to the number of documents in the segment or, more precisely, the number of documents that have at least one value in the field that is loaded in the current segment.

Here is an example mapping:

```json
{
    "tweet" : {
        "properties" : {
            "locale" : {
                "type" : "string",
                "fielddata" : "format=paged_bytes;filter.frequency.min=0.001;filter.frequency.max=0.1",
                "index" : "analyzed"
            }
        }
    }
}
```
### Parameters

 * `filter.frequency.min` - the minimum document frequency (inclusive) a term must have in order to be loaded into memory. Interpreted as a percentage if < `1.0`, otherwise as an absolute value. Defaults to `0` if omitted.
 * `filter.frequency.max` - the maximum document frequency (inclusive) a term may have in order to be loaded into memory. Interpreted as a percentage if < `1.0`, otherwise as an absolute value. Unbounded if omitted.
 * `filter.frequency.min_segment_size` - the minimum number of documents in a segment in order for the filter to be applied. Small segments can be excluded from filtering with this setting. Defaults to `0` if omitted.

## Regular Expression Filter

The regular expression filter applies a regular expression to each term during loading and only loads terms into memory that match the given regular expression. The whole term must match the expression.

Here is an example mapping:

```json
{
    "tweet" : {
        "properties" : {
            "locale" : {
                "type" : "string",
                "fielddata" : "format=paged_bytes;filter.regex.pattern=^en_.*",
                "index" : "analyzed"
            }
        }
    }
}
```

Closes #2874
This commit is contained in:
Simon Willnauer 2013-04-08 18:12:17 +02:00
parent acc0950957
commit 374bbbfa7b
7 changed files with 413 additions and 101 deletions

View File

@ -0,0 +1,172 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.fielddata.plain;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
import org.elasticsearch.index.mapper.FieldMapper.Names;
/**
 * Shared base class for bytes (string) index field data implementations that
 * expose ordinals.
 * <p>
 * Besides the common {@link #valuesOrdered()}, {@link #load(AtomicReaderContext)}
 * and {@link #comparatorSource(Object, SortMode)} implementations, it offers
 * {@link #filter(Terms, AtomicReader)}, which subclasses can use while loading a
 * segment to drop terms according to the <code>filter.frequency.*</code> and
 * <code>filter.regex.*</code> settings of the field data type.
 */
public abstract class AbstractBytesIndexFieldData<FD extends AtomicFieldData.WithOrdinals<ScriptDocValues.Strings>> extends AbstractIndexFieldData<FD> implements IndexFieldData.WithOrdinals<FD> {

    /** Settings found under <code>filter.frequency.</code>, or <code>null</code> if none are configured. */
    private final Settings frequency;

    /** Settings found under <code>filter.regex.</code>, or <code>null</code> if none are configured. */
    private final Settings regex;

    protected AbstractBytesIndexFieldData(Index index, Settings indexSettings, Names fieldNames, FieldDataType fieldDataType,
            IndexFieldDataCache cache) {
        super(index, indexSettings, fieldNames, fieldDataType, cache);
        frequency = getPrefixSettings(fieldDataType.getSettings(), "filter.frequency.");
        regex = getPrefixSettings(fieldDataType.getSettings(), "filter.regex.");
    }

    /**
     * Extracts the settings that start with the given prefix.
     *
     * @param settings the settings to read from
     * @param prefix   the key prefix to select
     * @return the selected settings, or <code>null</code> if no key carries the prefix
     */
    private static Settings getPrefixSettings(Settings settings, String prefix) {
        Settings byPrefix = settings.getByPrefix(prefix);
        return byPrefix.getAsMap().isEmpty() ? null : byPrefix;
    }

    @Override
    public final boolean valuesOrdered() {
        return true;
    }

    /**
     * Loads the atomic field data for the given segment through the cache.
     *
     * @throws ElasticSearchException if loading fails; any other throwable is
     *         wrapped, an {@link ElasticSearchException} is rethrown as is
     */
    @Override
    public FD load(AtomicReaderContext context) {
        try {
            return cache.load(context, this);
        } catch (Throwable e) {
            if (e instanceof ElasticSearchException) {
                throw (ElasticSearchException) e;
            } else {
                throw new ElasticSearchException(e.getMessage(), e);
            }
        }
    }

    /**
     * Returns a comparator source sorting on the bytes values of this field.
     * The <code>missingValue</code> argument is currently ignored.
     */
    @Override
    public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, SortMode sortMode) {
        // TODO support "missingValue" for sortMissingValue options here...
        return new BytesRefFieldComparatorSource(this, sortMode);
    }

    /**
     * Creates the terms enumeration used to load field data for one segment,
     * wrapped with the configured frequency and / or regular expression filters.
     *
     * @param terms  the terms of the field in the current segment
     * @param reader the reader of the current segment
     * @return the (possibly filtered) terms enumeration, or <code>null</code> if
     *         <code>terms.iterator(null)</code> returned <code>null</code>
     */
    protected TermsEnum filter(Terms terms, AtomicReader reader) throws IOException {
        TermsEnum iterator = terms.iterator(null);
        if (iterator == null) {
            return null;
        }
        // Neither filter factory returns null, so no further null checks are needed below.
        if (frequency != null) {
            iterator = FrequencyFilter.filter(iterator, terms, reader, frequency);
        }
        if (regex != null) {
            iterator = RegexFilter.filter(iterator, terms, reader, regex);
        }
        return iterator;
    }

    /**
     * Accepts only terms whose document frequency lies within
     * <code>[minFreq, maxFreq]</code> (both bounds inclusive).
     */
    private static final class FrequencyFilter extends FilteredTermsEnum {

        private final int minFreq;
        private final int maxFreq;

        public FrequencyFilter(TermsEnum delegate, int minFreq, int maxFreq) {
            // second argument: presumably "do not start with a seek" - see Lucene's FilteredTermsEnum
            super(delegate, false);
            this.minFreq = minFreq;
            this.maxFreq = maxFreq;
        }

        /**
         * Wraps the given enumeration in a frequency filter configured from the
         * <code>min</code>, <code>max</code> and <code>min_segment_size</code> settings.
         * Values below <code>1.0</code> are interpreted relative to the number of
         * documents that have a value for the field, everything else as an absolute
         * document frequency. If the segment does not have more than
         * <code>min_segment_size</code> such documents the enumeration is returned unfiltered.
         */
        public static TermsEnum filter(TermsEnum toFilter, Terms terms, AtomicReader reader, Settings settings) throws IOException {
            int docCount = terms.getDocCount();
            if (docCount == -1) {
                // no per-field document count available, fall back to the size of the segment
                docCount = reader.maxDoc();
            }
            final double minFrequency = settings.getAsDouble("min", 0d);
            // without an explicit maximum no term can exceed the bound
            final double maxFrequency = settings.getAsDouble("max", docCount + 1d);
            final int minSegmentSize = settings.getAsInt("min_segment_size", 0);
            if (minSegmentSize < docCount) {
                final int minFreq = minFrequency >= 1.0 ? (int) minFrequency : (int) (docCount * minFrequency);
                final int maxFreq = maxFrequency >= 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency);
                // Both bounds are inclusive, so min == max is a valid range; it also occurs
                // legitimately when relative bounds are truncated on small segments.
                assert minFreq <= maxFreq : "minFreq [" + minFreq + "] must not exceed maxFreq [" + maxFreq + "]";
                return new FrequencyFilter(toFilter, minFreq, maxFreq);
            }
            return toFilter;
        }

        @Override
        protected AcceptStatus accept(BytesRef term) throws IOException {
            final int docFreq = docFreq();
            if (docFreq >= minFreq && docFreq <= maxFreq) {
                return AcceptStatus.YES;
            }
            return AcceptStatus.NO;
        }
    }

    /**
     * Accepts only terms that match a regular expression as a whole.
     */
    private static final class RegexFilter extends FilteredTermsEnum {

        private final Matcher matcher;
        /** Reusable buffer holding the UTF-16 form of the term under test. */
        private final CharsRef spare = new CharsRef();

        public RegexFilter(TermsEnum delegate, Matcher matcher) {
            // second argument: presumably "do not start with a seek" - see Lucene's FilteredTermsEnum
            super(delegate, false);
            this.matcher = matcher;
        }

        /**
         * Wraps the given enumeration in a regular expression filter built from the
         * <code>pattern</code> setting, or returns it unchanged if no pattern is set.
         */
        public static TermsEnum filter(TermsEnum iterator, Terms terms, AtomicReader reader, Settings regex) {
            String pattern = regex.get("pattern");
            if (pattern == null) {
                return iterator;
            }
            Pattern p = Pattern.compile(pattern);
            // each filter instance gets its own matcher, which is reset for every term
            return new RegexFilter(iterator, p.matcher(""));
        }

        @Override
        protected AcceptStatus accept(BytesRef term) throws IOException {
            UnicodeUtil.UTF8toUTF16(term, spare);
            matcher.reset(spare);
            if (matcher.matches()) {
                return AcceptStatus.YES;
            }
            return AcceptStatus.NO;
        }
    }
}

View File

@ -19,35 +19,30 @@
package org.elasticsearch.index.fielddata.plain;
import java.util.ArrayList;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.ArrayList;
/**
*/
public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData<ConcreteBytesRefAtomicFieldData> implements IndexFieldData.WithOrdinals<ConcreteBytesRefAtomicFieldData> {
public class ConcreteBytesRefIndexFieldData extends AbstractBytesIndexFieldData<ConcreteBytesRefAtomicFieldData> {
public static class Builder implements IndexFieldData.Builder {
@Override
public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
public IndexFieldData<ConcreteBytesRefAtomicFieldData> build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
return new ConcreteBytesRefIndexFieldData(index, indexSettings, fieldNames, type, cache);
}
}
@ -56,24 +51,6 @@ public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData<Concr
super(index, indexSettings, fieldNames, fieldDataType, cache);
}
@Override
public boolean valuesOrdered() {
return true;
}
@Override
public ConcreteBytesRefAtomicFieldData load(AtomicReaderContext context) {
try {
return cache.load(context, this);
} catch (Throwable e) {
if (e instanceof ElasticSearchException) {
throw (ElasticSearchException) e;
} else {
throw new ElasticSearchException(e.getMessage(), e);
}
}
}
@Override
public ConcreteBytesRefAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
AtomicReader reader = context.reader();
@ -91,7 +68,7 @@ public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData<Concr
values.add(null); // first "t" indicates null value
OrdinalsBuilder builder = new OrdinalsBuilder(terms, reader.maxDoc());
try {
BytesRefIterator iter = builder.buildFromTerms(terms.iterator(null), reader.getLiveDocs());
BytesRefIterator iter = builder.buildFromTerms(filter(terms, reader), reader.getLiveDocs());
BytesRef term;
while ((term = iter.next()) != null) {
values.add(BytesRef.deepCopyOf(term));
@ -101,10 +78,4 @@ public class ConcreteBytesRefIndexFieldData extends AbstractIndexFieldData<Concr
builder.close();
}
}
@Override
public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, SortMode sortMode) {
// TODO support "missingValue" for sortMissingValue options here...
return new BytesRefFieldComparatorSource(this, sortMode);
}
}

View File

@ -30,16 +30,11 @@ import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
import org.elasticsearch.index.fielddata.ordinals.Ordinals;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.mapper.FieldMapper;
@ -47,38 +42,20 @@ import org.elasticsearch.index.settings.IndexSettings;
/**
*/
public class FSTBytesIndexFieldData extends AbstractIndexFieldData<FSTBytesAtomicFieldData> implements IndexFieldData.WithOrdinals<FSTBytesAtomicFieldData> {
public class FSTBytesIndexFieldData extends AbstractBytesIndexFieldData<FSTBytesAtomicFieldData> {
public static class Builder implements IndexFieldData.Builder {
@Override
public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
public IndexFieldData<FSTBytesAtomicFieldData> build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
return new FSTBytesIndexFieldData(index, indexSettings, fieldNames, type, cache);
}
}
public FSTBytesIndexFieldData(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) {
FSTBytesIndexFieldData(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) {
super(index, indexSettings, fieldNames, fieldDataType, cache);
}
@Override
public boolean valuesOrdered() {
return true;
}
@Override
public FSTBytesAtomicFieldData load(AtomicReaderContext context) {
try {
return cache.load(context, this);
} catch (Throwable e) {
if (e instanceof ElasticSearchException) {
throw (ElasticSearchException) e;
} else {
throw new ElasticSearchException(e.getMessage(), e);
}
}
}
@Override
public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
AtomicReader reader = context.reader();
@ -96,7 +73,7 @@ public class FSTBytesIndexFieldData extends AbstractIndexFieldData<FSTBytesAtomi
// 0 is reserved for "unset"
fstBuilder.add(Util.toIntsRef(new BytesRef(), scratch), 0l);
TermsEnum termsEnum = terms.iterator(null);
TermsEnum termsEnum = filter(terms, reader);
DocsEnum docsEnum = null;
for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
final int termOrd = builder.nextOrdinal();
@ -116,10 +93,4 @@ public class FSTBytesIndexFieldData extends AbstractIndexFieldData<FSTBytesAtomi
builder.close();
}
}
@Override
public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, SortMode sortMode) {
// TODO support "missingValue" for sortMissingValue options here...
return new BytesRefFieldComparatorSource(this, sortMode);
}
}

View File

@ -19,22 +19,21 @@
package org.elasticsearch.index.fielddata.plain;
import org.apache.lucene.index.*;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.AbstractIndexFieldData;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.SortMode;
import org.elasticsearch.index.fielddata.ordinals.Ordinals;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.mapper.FieldMapper;
@ -42,12 +41,12 @@ import org.elasticsearch.index.settings.IndexSettings;
/**
*/
public class PagedBytesIndexFieldData extends AbstractIndexFieldData<PagedBytesAtomicFieldData> implements IndexFieldData.WithOrdinals<PagedBytesAtomicFieldData> {
public class PagedBytesIndexFieldData extends AbstractBytesIndexFieldData<PagedBytesAtomicFieldData> {
public static class Builder implements IndexFieldData.Builder {
@Override
public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
public IndexFieldData<PagedBytesAtomicFieldData> build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) {
return new PagedBytesIndexFieldData(index, indexSettings, fieldNames, type, cache);
}
}
@ -56,24 +55,6 @@ public class PagedBytesIndexFieldData extends AbstractIndexFieldData<PagedBytesA
super(index, indexSettings, fieldNames, fieldDataType, cache);
}
@Override
public boolean valuesOrdered() {
return true;
}
@Override
public PagedBytesAtomicFieldData load(AtomicReaderContext context) {
try {
return cache.load(context, this);
} catch (Throwable e) {
if (e instanceof ElasticSearchException) {
throw (ElasticSearchException) e;
} else {
throw new ElasticSearchException(e.getMessage(), e);
}
}
}
@Override
public PagedBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
AtomicReader reader = context.reader();
@ -123,7 +104,7 @@ public class PagedBytesIndexFieldData extends AbstractIndexFieldData<PagedBytesA
try {
// 0 is reserved for "unset"
bytes.copyUsingLengthPrefix(new BytesRef());
TermsEnum termsEnum = terms.iterator(null);
TermsEnum termsEnum = filter(terms, reader);
DocsEnum docsEnum = null;
for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
final int termOrd = builder.nextOrdinal();
@ -149,10 +130,4 @@ public class PagedBytesIndexFieldData extends AbstractIndexFieldData<PagedBytesA
builder.close();
}
}
@Override
public XFieldComparatorSource comparatorSource(@Nullable Object missingValue, SortMode sortMode) {
// TODO support "missingValue" for sortMissingValue options here...
return new BytesRefFieldComparatorSource(this, sortMode);
}
}

View File

@ -497,6 +497,14 @@ public class SimpleFacetsTests extends AbstractNodesTests {
.field("type", "string")
.field("fielddata","format=fst")
.endObject()
.startObject("filtered")
.field("type", "string")
.field("fielddata","format=fst;filter.regex.pattern=\\d{1,2}") // only 1 or 2 digits
.endObject()
.startObject("filtered_mv")
.field("type", "string")
.field("fielddata","format=fst;filter.regex.pattern=\\d{1,2}") // only 1 or 2 digits
.endObject()
.endObject().endObject().endObject())
.execute().actionGet();
client.admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus().execute().actionGet();
@ -506,9 +514,11 @@ public class SimpleFacetsTests extends AbstractNodesTests {
.field("name_concrete", ""+i)
.field("name_paged", ""+i)
.field("name_fst", ""+i)
.field("filtered", ""+i)
.field("name_concrete_mv", ""+i, ""+Math.min(99, i+1))
.field("name_paged_mv", ""+i,""+ Math.min(99, i+1))
.field("name_fst_mv", ""+i,""+Math.min(99, i+1))
.field("filtered_mv", ""+i,""+Math.min(99, i+1), ""+(100 + i))
.endObject()).execute().actionGet();
}
@ -559,7 +569,10 @@ public class SimpleFacetsTests extends AbstractNodesTests {
final SearchRequestBuilder facetRequest;
int incrementAndGet = count.incrementAndGet();
final String field;
switch (incrementAndGet % 2) {
switch (incrementAndGet % 3) {
case 2:
field = "filtered"+postfix;
break;
case 1:
field = "name_concrete"+postfix;
break;

View File

@ -266,6 +266,7 @@ public class DuellFieldDataTest extends AbstractFieldDataTests {
typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "fst")), Type.Bytes);
typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "paged_bytes")), Type.Bytes);
typeMap.put(new FieldDataType("string", ImmutableSettings.builder().put("format", "concrete_bytes")), Type.Bytes);
// TODO add filters
ArrayList<Entry<FieldDataType, Type>> list = new ArrayList<Entry<FieldDataType, Type>>(typeMap.entrySet());
Preprocessor pre = new Preprocessor();
while (!list.isEmpty()) {

View File

@ -0,0 +1,209 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.fielddata;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReaderContext;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.AtomicFieldData.WithOrdinals;
import org.elasticsearch.index.fielddata.BytesValues;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.ScriptDocValues;
import org.elasticsearch.index.fielddata.ScriptDocValues.Strings;
import org.elasticsearch.index.fielddata.ordinals.Ordinals.Docs;
import org.elasticsearch.index.mapper.FieldMapper;
import org.testng.annotations.Test;
public class FilterFieldDataTest extends AbstractFieldDataTests {
@Override
protected FieldDataType getFieldDataType() {
// TODO Auto-generated method stub
return null;
}
@Test
public void testFilterByFrequency() throws Exception {
long seed = System.currentTimeMillis();
System.out.println("seed[testFilterByFrequency]: " + seed);
Random random = new Random(seed);
for (int i = 0; i < 1000; i++) {
Document d = new Document();
d.add(new StringField("id", "" + i, Field.Store.NO));
if (i % 100 == 0) {
d.add(new StringField("high_freq", "100", Field.Store.NO));
d.add(new StringField("low_freq", "100", Field.Store.NO));
d.add(new StringField("med_freq", "100", Field.Store.NO));
}
if (i % 10 == 0) {
d.add(new StringField("high_freq", "10", Field.Store.NO));
d.add(new StringField("med_freq", "10", Field.Store.NO));
}
if (i % 5 == 0) {
d.add(new StringField("high_freq", "5", Field.Store.NO));
}
writer.addDocument(d);
}
writer.forceMerge(1);
AtomicReaderContext context = refreshReader();
String[] formats = new String[] { "fst", "paged_bytes", "concrete_bytes" };
for (String format : formats) {
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.frequency.min_segment_size", 100).put("filter.frequency.min", 0.0d).put("filter.frequency.max", random.nextBoolean() ? 100 : 0.5d));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("high_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(2, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("10"));
assertThat(bytesValues.getValueByOrd(2).utf8ToString(), equalTo("100"));
}
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.frequency.min_segment_size", 100).put("filter.frequency.min", random.nextBoolean() ? 101 : 101d/200.0d).put("filter.frequency.max", 201));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("high_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(1, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("5"));
}
{
ifdService.clear(); // test # docs with value
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.frequency.min_segment_size", 101).put("filter.frequency.min", random.nextBoolean() ? 101 : 101d/200.0d));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("med_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(2, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("10"));
assertThat(bytesValues.getValueByOrd(2).utf8ToString(), equalTo("100"));
}
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.frequency.min_segment_size", 101).put("filter.frequency.min", random.nextBoolean() ? 101 : 101d/200.0d));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("med_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(2, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("10"));
assertThat(bytesValues.getValueByOrd(2).utf8ToString(), equalTo("100"));
}
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.regex.pattern", "\\d{2,3}") // allows 10 & 100
.put("filter.frequency.min_segment_size", 0)
.put("filter.frequency.min", random.nextBoolean() ? 1 : 1d/200.0d) // 100, 10, 5
.put("filter.frequency.max", random.nextBoolean() ? 99 : 99d/200.0d)); // 100
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("high_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(1, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("100"));
}
}
}
@Test
public void testFilterByRegExp() throws Exception {
int hundred = 0;
int ten = 0;
int five = 0;
for (int i = 0; i < 1000; i++) {
Document d = new Document();
d.add(new StringField("id", "" + i, Field.Store.NO));
if (i % 100 == 0) {
hundred++;
d.add(new StringField("high_freq", "100", Field.Store.NO));
}
if (i % 10 == 0) {
ten++;
d.add(new StringField("high_freq", "10", Field.Store.NO));
}
if (i % 5 == 0) {
five++;
d.add(new StringField("high_freq", "5", Field.Store.NO));
}
writer.addDocument(d);
}
System.out.println(hundred + " " + ten + " " +five);
writer.forceMerge(1);
AtomicReaderContext context = refreshReader();
String[] formats = new String[] { "fst", "paged_bytes", "concrete_bytes" };
for (String format : formats) {
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.regex.pattern", "\\d"));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("high_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(1, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("5"));
}
{
ifdService.clear();
FieldDataType fieldDataType = new FieldDataType("string", ImmutableSettings.builder().put("format", format)
.put("filter.regex.pattern", "\\d{1,2}"));
IndexFieldData fieldData = ifdService.getForField(new FieldMapper.Names("high_freq"), fieldDataType);
AtomicFieldData.WithOrdinals<ScriptDocValues.Strings> loadDirect = (WithOrdinals<Strings>) fieldData.loadDirect(context);
BytesValues.WithOrdinals bytesValues = loadDirect.getBytesValues();
Docs ordinals = bytesValues.ordinals();
assertThat(2, equalTo(ordinals.getNumOrds()));
assertThat(1000, equalTo(ordinals.getNumDocs()));
assertThat(bytesValues.getValueByOrd(1).utf8ToString(), equalTo("10"));
assertThat(bytesValues.getValueByOrd(2).utf8ToString(), equalTo("5"));
}
}
}
}