mirror of https://github.com/apache/lucene.git
LUCENE-5339: remove delim char
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542843 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
45a73c005e
commit
a872b82402
14
TODO
14
TODO
|
@ -2,10 +2,9 @@ nocommit this!
|
|||
|
||||
TODO
|
||||
- associations
|
||||
- nuke delimChar? can we escape instead?
|
||||
- wrap an IW instead of extending one?
|
||||
- cutover taxo writer/reader to pathToString/stringToPath
|
||||
- wrap an IW instead of extending one? or, FacetDocument?
|
||||
- re-enable ALL_BUT_DIM somehow?
|
||||
- SSDVValueSourceFacets?
|
||||
- we could put more stuff into the "schema", e.g. this field is
|
||||
sorted-set-DV and that one is taxo?
|
||||
- standardize on facet or facets (e.g. FacetIndexWriter)
|
||||
|
@ -13,9 +12,8 @@ TODO
|
|||
- how to do avg() agg?
|
||||
- test needsScores=true / valuesource associations
|
||||
- make FieldTypes optional (if all your dims are flat)?
|
||||
- add hierarchy to ssdv facets?
|
||||
- sparse faceting: allow skipping of certain dims?
|
||||
- maybe an interface/abstract class for "FacetResults"? has common
|
||||
API, ie to get top facets under a path, get all dims; then DS can
|
||||
use this?
|
||||
- consistently name things "dimension"? calling these fields is CONFUSING
|
||||
- later
|
||||
- SSDVValueSourceFacets?
|
||||
- add hierarchy to ssdv facets?
|
||||
- sparse faceting: allow skipping of certain dims?
|
||||
|
|
|
@ -1,27 +0,0 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// nocommit jdocs
|
||||
public final class Constants {
|
||||
public static final char DEFAULT_DELIM_CHAR = '\u001F';
|
||||
|
||||
private Constants() {
|
||||
// no
|
||||
}
|
||||
}
|
|
@ -44,17 +44,11 @@ import org.apache.lucene.util.IntsRef;
|
|||
public class FacetIndexWriter extends IndexWriter {
|
||||
|
||||
private final TaxonomyWriter taxoWriter;
|
||||
private final char facetDelimChar;
|
||||
private final FacetsConfig facetsConfig;
|
||||
|
||||
public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig) throws IOException {
|
||||
this(d, conf, taxoWriter, facetsConfig, Constants.DEFAULT_DELIM_CHAR);
|
||||
}
|
||||
|
||||
public FacetIndexWriter(Directory d, IndexWriterConfig conf, TaxonomyWriter taxoWriter, FacetsConfig facetsConfig, char facetDelimChar) throws IOException {
|
||||
super(d, conf);
|
||||
this.taxoWriter = taxoWriter;
|
||||
this.facetDelimChar = facetDelimChar;
|
||||
this.facetsConfig = facetsConfig;
|
||||
}
|
||||
|
||||
|
@ -175,7 +169,7 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
|
||||
// Drill down:
|
||||
for(int i=2;i<=cp.length;i++) {
|
||||
addedIndexedFields.add(new StringField(indexedFieldName, cp.subpath(i).toString(facetDelimChar), Field.Store.NO));
|
||||
addedIndexedFields.add(new StringField(indexedFieldName, pathToString(cp.components, i), Field.Store.NO));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -194,7 +188,7 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
|
||||
for(SortedSetDocValuesFacetField facetField : ent.getValue()) {
|
||||
FacetLabel cp = new FacetLabel(facetField.dim, facetField.label);
|
||||
String fullPath = cp.toString(facetDelimChar);
|
||||
String fullPath = pathToString(cp.components, cp.length);
|
||||
//System.out.println("add " + fullPath);
|
||||
|
||||
// For facet counts:
|
||||
|
@ -255,4 +249,72 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
return new BytesRef(bytes, 0, upto);
|
||||
}
|
||||
|
||||
// nocommit move these constants / methods to Util?
|
||||
|
||||
// Joins the path components together:
|
||||
private static final char DELIM_CHAR = '\u001F';
|
||||
|
||||
// Escapes any occurrence of the path component inside the label:
|
||||
private static final char ESCAPE_CHAR = '\u001E';
|
||||
|
||||
/** Turns a path into a string without stealing any
|
||||
* characters. */
|
||||
public static String pathToString(String dim, String[] path) {
|
||||
String[] fullPath = new String[1+path.length];
|
||||
fullPath[0] = dim;
|
||||
System.arraycopy(path, 0, fullPath, 1, path.length);
|
||||
return pathToString(fullPath, fullPath.length);
|
||||
}
|
||||
|
||||
public static String pathToString(String[] path) {
|
||||
return pathToString(path, path.length);
|
||||
}
|
||||
|
||||
public static String pathToString(String[] path, int length) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int i=0;i<length;i++) {
|
||||
String s = path[i];
|
||||
int numChars = s.length();
|
||||
for(int j=0;j<numChars;j++) {
|
||||
char ch = s.charAt(j);
|
||||
if (ch == DELIM_CHAR || ch == ESCAPE_CHAR) {
|
||||
sb.append(ESCAPE_CHAR);
|
||||
}
|
||||
sb.append(ch);
|
||||
}
|
||||
sb.append(DELIM_CHAR);
|
||||
}
|
||||
|
||||
// Trim off last DELIM_CHAR:
|
||||
sb.setLength(sb.length()-1);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Turns a result from previous call to {@link
|
||||
* #pathToString} back into the original {@code String[]}
|
||||
* without stealing any characters. */
|
||||
public static String[] stringToPath(String s) {
|
||||
List<String> parts = new ArrayList<String>();
|
||||
int length = s.length();
|
||||
char[] buffer = new char[length];
|
||||
|
||||
int upto = 0;
|
||||
boolean lastEscape = false;
|
||||
for(int i=0;i<length;i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (lastEscape) {
|
||||
buffer[upto++] = ch;
|
||||
lastEscape = false;
|
||||
} else if (ch == ESCAPE_CHAR) {
|
||||
lastEscape = true;
|
||||
} else if (ch == DELIM_CHAR) {
|
||||
parts.add(new String(buffer, 0, upto));
|
||||
upto = 0;
|
||||
} else {
|
||||
buffer[upto++] = ch;
|
||||
}
|
||||
}
|
||||
assert !lastEscape;
|
||||
return parts.toArray(new String[parts.size()]);
|
||||
}
|
||||
}
|
|
@ -26,7 +26,6 @@ public class FacetsConfig {
|
|||
|
||||
public static final String DEFAULT_INDEXED_FIELD_NAME = "$facets";
|
||||
|
||||
// nocommit pull the delim char into there?
|
||||
// nocommit pull DimType into here (shai?)
|
||||
|
||||
private final Map<String,DimConfig> fieldTypes = new ConcurrentHashMap<String,DimConfig>();
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.facet.simple;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -51,8 +52,8 @@ import org.apache.lucene.search.TermQuery;
|
|||
*/
|
||||
public final class SimpleDrillDownQuery extends Query {
|
||||
|
||||
private static Term term(String field, char delimChar, String dim, String[] path) {
|
||||
return new Term(field, FacetLabel.create(dim, path).toString(delimChar));
|
||||
private static Term term(String field, String dim, String[] path) {
|
||||
return new Term(field, FacetIndexWriter.pathToString(dim, path));
|
||||
}
|
||||
|
||||
private final FacetsConfig config;
|
||||
|
@ -134,10 +135,8 @@ public final class SimpleDrillDownQuery extends Query {
|
|||
}
|
||||
String indexedField = config.getDimConfig(dim).indexedFieldName;
|
||||
|
||||
// nocommit pull this from FacetsConfig
|
||||
char delimChar = Constants.DEFAULT_DELIM_CHAR;
|
||||
BooleanQuery bq = (BooleanQuery) q.getQuery();
|
||||
bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD);
|
||||
bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD);
|
||||
}
|
||||
|
||||
/** Adds one dimension of drill downs; if you pass the same
|
||||
|
@ -153,13 +152,11 @@ public final class SimpleDrillDownQuery extends Query {
|
|||
}
|
||||
String indexedField = config.getDimConfig(dim).indexedFieldName;
|
||||
|
||||
// nocommit pull this from FacetsConfig
|
||||
char delimChar = Constants.DEFAULT_DELIM_CHAR;
|
||||
BooleanQuery bq = new BooleanQuery(true); // disable coord
|
||||
if (path.length == 0) {
|
||||
throw new IllegalArgumentException("must have at least one facet label under dim");
|
||||
}
|
||||
bq.add(new TermQuery(term(indexedField, delimChar, dim, path)), Occur.SHOULD);
|
||||
bq.add(new TermQuery(term(indexedField, dim, path)), Occur.SHOULD);
|
||||
|
||||
add(dim, bq);
|
||||
}
|
||||
|
|
|
@ -230,7 +230,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
throw new IllegalArgumentException("path must be length=1");
|
||||
}
|
||||
|
||||
int ord = (int) dv.lookupTerm(new BytesRef(dim + state.separator + path[0]));
|
||||
int ord = (int) dv.lookupTerm(new BytesRef(FacetIndexWriter.pathToString(dim, path)));
|
||||
if (ord < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -52,8 +52,6 @@ public final class SortedSetDocValuesReaderState {
|
|||
private final AtomicReader topReader;
|
||||
private final int valueCount;
|
||||
public final IndexReader origReader;
|
||||
public final char separator;
|
||||
final String separatorRegex;
|
||||
|
||||
/** Holds start/end range of ords, which maps to one
|
||||
* dimension (someday we may generalize it to map to
|
||||
|
@ -74,21 +72,15 @@ public final class SortedSetDocValuesReaderState {
|
|||
private final Map<String,OrdRange> prefixToOrdRange = new HashMap<String,OrdRange>();
|
||||
|
||||
public SortedSetDocValuesReaderState(IndexReader reader) throws IOException {
|
||||
this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, Constants.DEFAULT_DELIM_CHAR);
|
||||
}
|
||||
|
||||
public SortedSetDocValuesReaderState(IndexReader reader, String dvField) throws IOException {
|
||||
this(reader, dvField, Constants.DEFAULT_DELIM_CHAR);
|
||||
this(reader, FacetsConfig.DEFAULT_INDEXED_FIELD_NAME);
|
||||
}
|
||||
|
||||
/** Create an instance, scanning the {@link
|
||||
* SortedSetDocValues} from the provided reader, with
|
||||
* default {@link FacetIndexingParams}. */
|
||||
public SortedSetDocValuesReaderState(IndexReader reader, String field, char delimChar) throws IOException {
|
||||
public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
|
||||
|
||||
this.field = field;
|
||||
this.separator = delimChar;
|
||||
this.separatorRegex = Pattern.quote(Character.toString(separator));
|
||||
this.origReader = reader;
|
||||
|
||||
// We need this to create thread-safe MultiSortedSetDV
|
||||
|
@ -116,7 +108,7 @@ public final class SortedSetDocValuesReaderState {
|
|||
// support arbitrary hierarchy:
|
||||
for(int ord=0;ord<valueCount;ord++) {
|
||||
dv.lookupOrd(ord, spare);
|
||||
String[] components = spare.utf8ToString().split(separatorRegex, 2);
|
||||
String[] components = FacetIndexWriter.stringToPath(spare.utf8ToString());
|
||||
if (components.length != 2) {
|
||||
throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + spare.utf8ToString());
|
||||
}
|
||||
|
|
|
@ -274,6 +274,7 @@ public class FacetLabel implements Comparable<FacetLabel> {
|
|||
* Returns a string representation of the path, separating components with the
|
||||
* given delimiter.
|
||||
*/
|
||||
// nocommit remove
|
||||
public String toString(char delimiter) {
|
||||
if (length == 0) return "";
|
||||
|
||||
|
|
Loading…
Reference in New Issue