From a872b82402401935f9c828c6305cf02b848c532f Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sun, 17 Nov 2013 21:57:42 +0000 Subject: [PATCH] LUCENE-5339: remove delim char git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542843 13f79535-47bb-0310-9956-ffa450edef68 --- TODO | 14 ++-- .../apache/lucene/facet/simple/Constants.java | 27 ------- .../lucene/facet/simple/FacetIndexWriter.java | 78 +++++++++++++++++-- .../lucene/facet/simple/FacetsConfig.java | 1 - .../facet/simple/SimpleDrillDownQuery.java | 13 ++-- .../simple/SortedSetDocValuesFacetCounts.java | 2 +- .../simple/SortedSetDocValuesReaderState.java | 14 +--- .../lucene/facet/taxonomy/FacetLabel.java | 1 + 8 files changed, 86 insertions(+), 64 deletions(-) delete mode 100644 lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java diff --git a/TODO b/TODO index c2422db4364..9034b0c4127 100644 --- a/TODO +++ b/TODO @@ -2,10 +2,9 @@ nocommit this! TODO - associations - - nuke delimChar? can we escape instead? - - wrap an IW instead of extending one? + - cutover taxo writer/reader to pathToString/stringToPath + - wrap an IW instead of extending one? or, FacetDocument? - re-enable ALL_BUT_DIM somehow? - - SSDVValueSourceFacets? - we could put more stuff into the "schema", e.g. this field is sorted-set-DV and that one is taxo? - standardize on facet or facets (e.g. FacetIndexWriter) @@ -13,9 +12,8 @@ TODO - how to do avg() agg? - test needsScores=true / valuesource associations - make FieldTypes optional (if all your dims are flat)? - - add hierarchy to ssdv facets? - - sparse faceting: allow skipping of certain dims? - - maybe an interface/abstract class for "FacetResults"? has common - API, ie to get top facets under a path, get all dims; then DS can - use this? - consistently name things "dimension"? calling these fields is CONFUSING + - later + - SSDVValueSourceFacets? + - add hierarchy to ssdv facets? + - sparse faceting: allow skipping of certain dims? diff --git a/lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java b/lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java deleted file mode 100644 index 3b311c89670..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.apache.lucene.facet.simple; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// nocommit jdocs -public final class Constants { - public static final char DEFAULT_DELIM_CHAR = '\u001F'; - - private Constants() { - // no - } -} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/simple/FacetIndexWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/simple/FacetIndexWriter.java index 0e5db53748b..f9c18304e72 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/simple/FacetIndexWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/simple/FacetIndexWriter.java @@ -44,17 +44,11 @@ import org.apache.lucene.util.IntsRef; public class FacetIndexWriter extends IndexWriter { private final TaxonomyWriter taxoWriter; - private final char facetDelimChar; private final FacetsConfig facetsConfig; public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig) throws IOException { - this(d, conf, taxoWriter, facetsConfig, Constants.DEFAULT_DELIM_CHAR); - } - - public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig, char facetDelimChar) throws IOException { super(d, conf); this.taxoWriter = taxoWriter; - this.facetDelimChar = facetDelimChar; this.facetsConfig = facetsConfig; } @@ -175,7 +169,7 @@ public class FacetIndexWriter extends IndexWriter { // Drill down: for(int i=2;i<=cp.length;i++) { - addedIndexedFields.add(new StringField(indexedFieldName, cp.subpath(i).toString(facetDelimChar), Field.Store.NO)); + addedIndexedFields.add(new StringField(indexedFieldName, pathToString(cp.components, i), Field.Store.NO)); } } @@ -194,7 +188,7 @@ public class FacetIndexWriter extends IndexWriter { for(SortedSetDocValuesFacetField facetField : ent.getValue()) { FacetLabel cp = new FacetLabel(facetField.dim, facetField.label); - String fullPath = cp.toString(facetDelimChar); + String fullPath = pathToString(cp.components, cp.length); //System.out.println("add " + fullPath); // For facet counts: @@ -255,4 +249,72 @@ public class FacetIndexWriter extends IndexWriter { return new BytesRef(bytes, 0, upto); } + // nocommit move these constants / methods to Util? + + // Joins the path components together: + private static final char DELIM_CHAR = '\u001F'; + + // Escapes any occurrence of the path component inside the label: + private static final char ESCAPE_CHAR = '\u001E'; + + /** Turns a path into a string without stealing any + * characters. */ + public static String pathToString(String dim, String[] path) { + String[] fullPath = new String[1+path.length]; + fullPath[0] = dim; + System.arraycopy(path, 0, fullPath, 1, path.length); + return pathToString(fullPath, fullPath.length); + } + + public static String pathToString(String[] path) { + return pathToString(path, path.length); + } + + public static String pathToString(String[] path, int length) { + StringBuilder sb = new StringBuilder(); + for(int i=0;i parts = new ArrayList(); + int length = s.length(); + char[] buffer = new char[length]; + + int upto = 0; + boolean lastEscape = false; + for(int i=0;i fieldTypes = new ConcurrentHashMap(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java b/lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java index 10bb64496a0..591fa85b049 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java @@ -18,6 +18,7 @@ package org.apache.lucene.facet.simple; */ import java.io.IOException; +import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -51,8 +52,8 @@ import org.apache.lucene.search.TermQuery; */ public final class SimpleDrillDownQuery extends Query { - private static Term term(String field, char delimChar, String dim, String[] path) { - return new Term(field, FacetLabel.create(dim, path).toString(delimChar)); + private static Term term(String field, String dim, String[] path) { + return new Term(field, FacetIndexWriter.pathToString(dim, path)); } private final FacetsConfig config; @@ -134,10 +135,8 @@ public final class SimpleDrillDownQuery extends Query { } String indexedField = config.getDimConfig(dim).indexedFieldName; - // nocommit pull this from FacetsConfig - char delimChar = Constants.DEFAULT_DELIM_CHAR; BooleanQuery bq = (BooleanQuery) q.getQuery(); - bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD); + bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD); } /** Adds one dimension of drill downs; if you pass the same @@ -153,13 +152,11 @@ public final class SimpleDrillDownQuery extends Query { } String indexedField = config.getDimConfig(dim).indexedFieldName; - // nocommit pull this from FacetsConfig - char delimChar = Constants.DEFAULT_DELIM_CHAR; BooleanQuery bq = new BooleanQuery(true); // disable coord if (path.length == 0) { throw new IllegalArgumentException("must have at least one facet label under dim"); } - bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD); + bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD); add(dim, bq); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java index 80b74791ff8..1efd51e5d84 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java @@ -230,7 +230,7 @@ public class SortedSetDocValuesFacetCounts extends Facets { throw new IllegalArgumentException("path must be length=1"); } - int ord = (int) dv.lookupTerm(new BytesRef(dim + state.separator + path[0])); + int ord = (int) dv.lookupTerm(new BytesRef(FacetIndexWriter.pathToString(dim, path))); if (ord < 0) { return -1; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java b/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java index 823825d7453..6b5b587c6af 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java @@ -52,8 +52,6 @@ public final class SortedSetDocValuesReaderState { private final AtomicReader topReader; private final int valueCount; public final IndexReader origReader; - public final char separator; - final String separatorRegex; /** Holds start/end range of ords, which maps to one * dimension (someday we may generalize it to map to @@ -74,21 +72,15 @@ public final class SortedSetDocValuesReaderState { private final Map prefixToOrdRange = new HashMap(); public SortedSetDocValuesReaderState(IndexReader reader) throws IOException { - this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, Constants.DEFAULT_DELIM_CHAR); - } - - public SortedSetDocValuesReaderState(IndexReader reader, String dvField) throws IOException { - this(reader, dvField, Constants.DEFAULT_DELIM_CHAR); + this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME); } /** Create an instance, scanning the {@link * SortedSetDocValues} from the provided reader, with * default {@link FacetIndexingParams}. */ - public SortedSetDocValuesReaderState(IndexReader reader, String field, char delimChar) throws IOException { + public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { this.field = field; - this.separator = delimChar; - this.separatorRegex = Pattern.quote(Character.toString(separator)); this.origReader = reader; // We need this to create thread-safe MultiSortedSetDV @@ -116,7 +108,7 @@ public final class SortedSetDocValuesReaderState { // support arbitrary hierarchy: for(int ord=0;ord { * Returns a string representation of the path, separating components with the * given delimiter. */ + // nocommit remove public String toString(char delimiter) { if (length == 0) return "";