LUCENE-5339: remove delim char

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542843 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-11-17 21:57:42 +00:00
parent 45a73c005e
commit a872b82402
8 changed files with 86 additions and 64 deletions

14
TODO
View File

@ -2,10 +2,9 @@ nocommit this!
TODO
- associations
- nuke delimChar? can we escape instead?
- wrap an IW instead of extending one?
- cutover taxo writer/reader to pathToString/stringToPath
- wrap an IW instead of extending one? or, FacetDocument?
- re-enable ALL_BUT_DIM somehow?
- SSDVValueSourceFacets?
- we could put more stuff into the "schema", e.g. this field is
sorted-set-DV and that one is taxo?
- standardize on facet or facets (e.g. FacetIndexWriter)
@ -13,9 +12,8 @@ TODO
- how to do avg() agg?
- test needsScores=true / valuesource associations
- make FieldTypes optional (if all your dims are flat)?
- add hierarchy to ssdv facets?
- sparse faceting: allow skipping of certain dims?
- maybe an interface/abstract class for "FacetResults"? has common
API, ie to get top facets under a path, get all dims; then DS can
use this?
- consistently name things "dimension"? calling these fields is CONFUSING
- later
- SSDVValueSourceFacets?
- add hierarchy to ssdv facets?
- sparse faceting: allow skipping of certain dims?

View File

@ -1,27 +0,0 @@
package org.apache.lucene.facet.simple;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// nocommit jdocs
public final class Constants {
public static final char DEFAULT_DELIM_CHAR = '\u001F';
private Constants() {
// no
}
}

View File

@ -44,17 +44,11 @@ import org.apache.lucene.util.IntsRef;
public class FacetIndexWriter extends IndexWriter {
private final TaxonomyWriter taxoWriter;
private final char facetDelimChar;
private final FacetsConfig facetsConfig;
public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig) throws IOException {
this(d, conf, taxoWriter, facetsConfig, Constants.DEFAULT_DELIM_CHAR);
}
public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig, char facetDelimChar) throws IOException {
super(d, conf);
this.taxoWriter = taxoWriter;
this.facetDelimChar = facetDelimChar;
this.facetsConfig = facetsConfig;
}
@ -175,7 +169,7 @@ public class FacetIndexWriter extends IndexWriter {
// Drill down:
for(int i=2;i<=cp.length;i++) {
addedIndexedFields.add(new StringField(indexedFieldName, cp.subpath(i).toString(facetDelimChar), Field.Store.NO));
addedIndexedFields.add(new StringField(indexedFieldName, pathToString(cp.components, i), Field.Store.NO));
}
}
@ -194,7 +188,7 @@ public class FacetIndexWriter extends IndexWriter {
for(SortedSetDocValuesFacetField facetField : ent.getValue()) {
FacetLabel cp = new FacetLabel(facetField.dim, facetField.label);
String fullPath = cp.toString(facetDelimChar);
String fullPath = pathToString(cp.components, cp.length);
//System.out.println("add " + fullPath);
// For facet counts:
@ -255,4 +249,72 @@ public class FacetIndexWriter extends IndexWriter {
return new BytesRef(bytes, 0, upto);
}
// nocommit move these constants / methods to Util?
// Joins the path components together:
private static final char DELIM_CHAR = '\u001F';
// Escapes any occurrence of the path component inside the label:
private static final char ESCAPE_CHAR = '\u001E';
/** Turns a path into a string without stealing any
* characters. */
public static String pathToString(String dim, String[] path) {
String[] fullPath = new String[1+path.length];
fullPath[0] = dim;
System.arraycopy(path, 0, fullPath, 1, path.length);
return pathToString(fullPath, fullPath.length);
}
public static String pathToString(String[] path) {
return pathToString(path, path.length);
}
public static String pathToString(String[] path, int length) {
StringBuilder sb = new StringBuilder();
for(int i=0;i<length;i++) {
String s = path[i];
int numChars = s.length();
for(int j=0;j<numChars;j++) {
char ch = s.charAt(j);
if (ch == DELIM_CHAR || ch == ESCAPE_CHAR) {
sb.append(ESCAPE_CHAR);
}
sb.append(ch);
}
sb.append(DELIM_CHAR);
}
// Trim off last DELIM_CHAR:
sb.setLength(sb.length()-1);
return sb.toString();
}
/** Turns a result from previous call to {@link
* #pathToString} back into the original {@code String[]}
* without stealing any characters. */
public static String[] stringToPath(String s) {
List<String> parts = new ArrayList<String>();
int length = s.length();
char[] buffer = new char[length];
int upto = 0;
boolean lastEscape = false;
for(int i=0;i<length;i++) {
char ch = s.charAt(i);
if (lastEscape) {
buffer[upto++] = ch;
lastEscape = false;
} else if (ch == ESCAPE_CHAR) {
lastEscape = true;
} else if (ch == DELIM_CHAR) {
parts.add(new String(buffer, 0, upto));
upto = 0;
} else {
buffer[upto++] = ch;
}
}
assert !lastEscape;
return parts.toArray(new String[parts.size()]);
}
}

View File

@ -26,7 +26,6 @@ public class FacetsConfig {
public static final String DEFAULT_INDEXED_FIELD_NAME = "$facets";
// nocommit pull the delim char into there?
// nocommit pull DimType into here (shai?)
private final Map<String,DimConfig> fieldTypes = new ConcurrentHashMap<String,DimConfig>();

View File

@ -18,6 +18,7 @@ package org.apache.lucene.facet.simple;
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@ -51,8 +52,8 @@ import org.apache.lucene.search.TermQuery;
*/
public final class SimpleDrillDownQuery extends Query {
private static Term term(String field, char delimChar, String dim, String[] path) {
return new Term(field, FacetLabel.create(dim, path).toString(delimChar));
private static Term term(String field, String dim, String[] path) {
return new Term(field, FacetIndexWriter.pathToString(dim, path));
}
private final FacetsConfig config;
@ -134,10 +135,8 @@ public final class SimpleDrillDownQuery extends Query {
}
String indexedField = config.getDimConfig(dim).indexedFieldName;
// nocommit pull this from FacetsConfig
char delimChar = Constants.DEFAULT_DELIM_CHAR;
BooleanQuery bq = (BooleanQuery) q.getQuery();
bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD);
bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD);
}
/** Adds one dimension of drill downs; if you pass the same
@ -153,13 +152,11 @@ public final class SimpleDrillDownQuery extends Query {
}
String indexedField = config.getDimConfig(dim).indexedFieldName;
// nocommit pull this from FacetsConfig
char delimChar = Constants.DEFAULT_DELIM_CHAR;
BooleanQuery bq = new BooleanQuery(true); // disable coord
if (path.length == 0) {
throw new IllegalArgumentException("must have at least one facet label under dim");
}
bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD);
bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD);
add(dim, bq);
}

View File

@ -230,7 +230,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
throw new IllegalArgumentException("path must be length=1");
}
int ord = (int) dv.lookupTerm(new BytesRef(dim + state.separator + path[0]));
int ord = (int) dv.lookupTerm(new BytesRef(FacetIndexWriter.pathToString(dim, path)));
if (ord < 0) {
return -1;
}

View File

@ -52,8 +52,6 @@ public final class SortedSetDocValuesReaderState {
private final AtomicReader topReader;
private final int valueCount;
public final IndexReader origReader;
public final char separator;
final String separatorRegex;
/** Holds start/end range of ords, which maps to one
* dimension (someday we may generalize it to map to
@ -74,21 +72,15 @@ public final class SortedSetDocValuesReaderState {
private final Map<String,OrdRange> prefixToOrdRange = new HashMap<String,OrdRange>();
public SortedSetDocValuesReaderState(IndexReader reader) throws IOException {
this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, Constants.DEFAULT_DELIM_CHAR);
}
public SortedSetDocValuesReaderState(IndexReader reader, String dvField) throws IOException {
this(reader, dvField, Constants.DEFAULT_DELIM_CHAR);
this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME);
}
/** Create an instance, scanning the {@link
* SortedSetDocValues} from the provided reader, with
* default {@link FacetIndexingParams}. */
public SortedSetDocValuesReaderState(IndexReader reader, String field, char delimChar) throws IOException {
public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
this.field = field;
this.separator = delimChar;
this.separatorRegex = Pattern.quote(Character.toString(separator));
this.origReader = reader;
// We need this to create thread-safe MultiSortedSetDV
@ -116,7 +108,7 @@ public final class SortedSetDocValuesReaderState {
// support arbitrary hierarchy:
for(int ord=0;ord<valueCount;ord++) {
dv.lookupOrd(ord, spare);
String[] components = spare.utf8ToString().split(separatorRegex, 2);
String[] components = FacetIndexWriter.stringToPath(spare.utf8ToString());
if (components.length != 2) {
throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + spare.utf8ToString());
}

View File

@ -274,6 +274,7 @@ public class FacetLabel implements Comparable<FacetLabel> {
* Returns a string representation of the path, separating components with the
* given delimiter.
*/
// nocommit remove
public String toString(char delimiter) {
if (length == 0) return "";