mirror of https://github.com/honeymoose/OpenSearch.git (synced 2025-02-26 06:46:10 +00:00)
Analysis: Word Delimiter Token Filter, closes #918.
This commit is contained in:
parent 465036655f
commit 563ad625c0

.idea/dictionaries/kimchy.xml (generated, 2 additions)
@@ -23,6 +23,7 @@
     <w>calc</w>
     <w>camelcase</w>
     <w>canonicalhost</w>
+    <w>catenate</w>
     <w>charfilter</w>
     <w>charsets</w>
     <w>checksum</w>
@@ -107,6 +108,7 @@
     <w>ngram</w>
     <w>noop</w>
     <w>nospawn</w>
+    <w>numerics</w>
     <w>param</w>
     <w>params</w>
     <w>persistency</w>
WordDelimiterFilter.java (new file)
@@ -0,0 +1,574 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

import java.io.IOException;

/**
 * Splits words into subwords and performs optional transformations on subword groups.
 * Words are split into subwords with the following rules:
 * - split on intra-word delimiters (by default, all non-alphanumeric characters):
 *   "Wi-Fi" -> "Wi", "Fi"
 * - split on case transitions:
 *   "PowerShot" -> "Power", "Shot"
 * - split on letter-number transitions:
 *   "SD500" -> "SD", "500"
 * - leading and trailing intra-word delimiters on each subword are ignored:
 *   "//hello---there, 'dude'" -> "hello", "there", "dude"
 * - trailing "'s" is removed from each subword:
 *   "O'Neil's" -> "O", "Neil"
 *   Note: this step isn't performed in a separate filter because of possible subword combinations.
 *
 * The <b>combinations</b> parameter affects how subwords are combined:
 * - combinations="0" causes no subword combinations:
 *   "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
 * - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords
 *   are catenated and produced at the same position as the last subword in the run:
 *   "PowerShot" -> 0:"Power", 1:"Shot", 1:"PowerShot"
 *   "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
 *   "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500", 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
 *
 * One use for WordDelimiterFilter is to help match words with different subword delimiters.
 * For example, if the source text contained "wi-fi", one may want "wifi", "WiFi", "wi-fi", and "wi+fi"
 * queries to all match. One way of doing so is to specify combinations="1" in the analyzer used for
 * indexing, and combinations="0" (the default) in the analyzer used for querying. Given that the current
 * StandardTokenizer immediately removes many intra-word delimiters, it is recommended that this filter
 * be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
 */
// LUCENE MONITOR: Part of Lucene 4.0; once we upgrade, remove it
public final class WordDelimiterFilter extends TokenFilter {

    public static final int LOWER = 0x01;
    public static final int UPPER = 0x02;
    public static final int DIGIT = 0x04;
    public static final int SUBWORD_DELIM = 0x08;

    // combinations: for testing, not for setting bits
    public static final int ALPHA = 0x03;
    public static final int ALPHANUM = 0x07;

    /**
     * If true, causes parts of words to be generated:
     * "PowerShot" => "Power" "Shot"
     */
    final boolean generateWordParts;

    /**
     * If true, causes number subwords to be generated:
     * "500-42" => "500" "42"
     */
    final boolean generateNumberParts;

    /**
     * If true, causes maximum runs of word parts to be catenated:
     * "wi-fi" => "wifi"
     */
    final boolean catenateWords;

    /**
     * If true, causes maximum runs of number parts to be catenated:
     * "500-42" => "50042"
     */
    final boolean catenateNumbers;

    /**
     * If true, causes all subword parts to be catenated:
     * "wi-fi-4000" => "wifi4000"
     */
    final boolean catenateAll;

    /**
     * If true, original words are preserved and added to the subword list. (Defaults to false)
     * "500-42" => "500" "42" "500-42"
     */
    final boolean preserveOriginal;

    /**
     * If not null, the set of tokens to protect from being delimited
     */
    final CharArraySet protWords;

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
    private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

    // used for iterating word delimiter breaks
    private final WordDelimiterIterator iterator;

    // used for concatenating runs of similarly typed subwords (word, number)
    private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
    // number of subwords last output by concat
    private int lastConcatCount = 0;

    // used for catenate all
    private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();

    // used for accumulating position increment gaps
    private int accumPosInc = 0;

    private char savedBuffer[] = new char[1024];
    private int savedStartOffset;
    private int savedEndOffset;
    private String savedType;
    private boolean hasSavedState = false;
    // if the length implied by the start and end offsets doesn't match the term text,
    // assume this is a synonym and don't adjust the offsets
    private boolean hasIllegalOffsets = false;

    // for a run of the same subword type within a word, have we output anything?
    private boolean hasOutputToken = false;
    // when preserve original is on, have we output any token following it?
    // this token must have posInc=0!
    private boolean hasOutputFollowingOriginal = false;

    /**
     * @param in                    Token stream to be filtered.
     * @param charTypeTable         table containing character types
     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
     * @param protWords             If not null, the set of tokens to protect from being delimited
     */
    public WordDelimiterFilter(TokenStream in,
                               byte[] charTypeTable,
                               int generateWordParts,
                               int generateNumberParts,
                               int catenateWords,
                               int catenateNumbers,
                               int catenateAll,
                               int splitOnCaseChange,
                               int preserveOriginal,
                               int splitOnNumerics,
                               int stemEnglishPossessive,
                               CharArraySet protWords) {
        super(in);
        this.generateWordParts = generateWordParts != 0;
        this.generateNumberParts = generateNumberParts != 0;
        this.catenateWords = catenateWords != 0;
        this.catenateNumbers = catenateNumbers != 0;
        this.catenateAll = catenateAll != 0;
        this.preserveOriginal = preserveOriginal != 0;
        this.protWords = protWords;
        this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
    }

    /**
     * @param in                    Token stream to be filtered.
     * @param generateWordParts     If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
     * @param generateNumberParts   If 1, causes number subwords to be generated: "500-42" => "500" "42"
     * @param catenateWords         If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
     * @param catenateNumbers       If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
     * @param catenateAll           If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
     * @param splitOnCaseChange     If 1, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
     * @param preserveOriginal      If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
     * @param splitOnNumerics       If 1, causes "j2se" to be three tokens: "j" "2" "se"
     * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
     * @param protWords             If not null, the set of tokens to protect from being delimited
     */
    public WordDelimiterFilter(TokenStream in,
                               int generateWordParts,
                               int generateNumberParts,
                               int catenateWords,
                               int catenateNumbers,
                               int catenateAll,
                               int splitOnCaseChange,
                               int preserveOriginal,
                               int splitOnNumerics,
                               int stemEnglishPossessive,
                               CharArraySet protWords) {
        this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
    }

    public boolean incrementToken() throws IOException {
        while (true) {
            if (!hasSavedState) {
                // process a new input word
                if (!input.incrementToken()) {
                    return false;
                }

                int termLength = termAttribute.length();
                char[] termBuffer = termAttribute.buffer();

                accumPosInc += posIncAttribute.getPositionIncrement();

                iterator.setText(termBuffer, termLength);
                iterator.next();

                // word with no delimiters, or protected word: just return it
                if ((iterator.current == 0 && iterator.end == termLength) ||
                        (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
                    posIncAttribute.setPositionIncrement(accumPosInc);
                    accumPosInc = 0;
                    return true;
                }

                // word consisting only of delimiters
                if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
                    // if the posInc is 1, simply ignore it in the accumulation
                    if (posIncAttribute.getPositionIncrement() == 1) {
                        accumPosInc--;
                    }
                    continue;
                }

                saveState();

                hasOutputToken = false;
                hasOutputFollowingOriginal = !preserveOriginal;
                lastConcatCount = 0;

                if (preserveOriginal) {
                    posIncAttribute.setPositionIncrement(accumPosInc);
                    accumPosInc = 0;
                    return true;
                }
            }

            // at the end of the string, output any concatenations
            if (iterator.end == WordDelimiterIterator.DONE) {
                if (!concat.isEmpty()) {
                    if (flushConcatenation(concat)) {
                        return true;
                    }
                }

                if (!concatAll.isEmpty()) {
                    // only if we haven't output this same combo above!
                    if (concatAll.subwordCount > lastConcatCount) {
                        concatAll.writeAndClear();
                        return true;
                    }
                    concatAll.clear();
                }

                // no saved concatenations, on to the next input word
                hasSavedState = false;
                continue;
            }

            // word surrounded by delimiters: always output
            if (iterator.isSingleWord()) {
                generatePart(true);
                iterator.next();
                return true;
            }

            int wordType = iterator.type();

            // do we already have queued-up incompatible concatenations?
            if (!concat.isEmpty() && (concat.type & wordType) == 0) {
                if (flushConcatenation(concat)) {
                    hasOutputToken = false;
                    return true;
                }
                hasOutputToken = false;
            }

            // add subwords depending upon options
            if (shouldConcatenate(wordType)) {
                if (concat.isEmpty()) {
                    concat.type = wordType;
                }
                concatenate(concat);
            }

            // add all subwords (catenateAll)
            if (catenateAll) {
                concatenate(concatAll);
            }

            // if we should output the word or number part
            if (shouldGenerateParts(wordType)) {
                generatePart(false);
                iterator.next();
                return true;
            }

            iterator.next();
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        hasSavedState = false;
        concat.clear();
        concatAll.clear();
        accumPosInc = 0;
    }

    // ================================================= Helper Methods ================================================

    /**
     * Saves the existing attribute states
     */
    private void saveState() {
        // otherwise, we have delimiters, save state
        savedStartOffset = offsetAttribute.startOffset();
        savedEndOffset = offsetAttribute.endOffset();
        // if the length implied by the start and end offsets doesn't match the term text, assume this is a synonym and don't adjust the offsets
        hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
        savedType = typeAttribute.type();

        if (savedBuffer.length < termAttribute.length()) {
            savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
        }

        System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
        iterator.text = savedBuffer;

        hasSavedState = true;
    }

    /**
     * Flushes the given WordDelimiterConcatenation by either writing its concatenation and then clearing, or just clearing.
     *
     * @param concatenation WordDelimiterConcatenation that will be flushed
     * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
     */
    private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
        lastConcatCount = concatenation.subwordCount;
        if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
            concatenation.writeAndClear();
            return true;
        }
        concatenation.clear();
        return false;
    }

    /**
     * Determines whether to concatenate a word or number if the current word is of the given type
     *
     * @param wordType Type of the current word used to determine if it should be concatenated
     * @return {@code true} if concatenation should occur, {@code false} otherwise
     */
    private boolean shouldConcatenate(int wordType) {
        return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
    }

    /**
     * Determines whether a word/number part should be generated for a word of the given type
     *
     * @param wordType Type of the word used to determine if a word/number part should be generated
     * @return {@code true} if a word/number part should be generated, {@code false} otherwise
     */
    private boolean shouldGenerateParts(int wordType) {
        return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
    }

    /**
     * Concatenates the saved buffer to the given WordDelimiterConcatenation
     *
     * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
     */
    private void concatenate(WordDelimiterConcatenation concatenation) {
        if (concatenation.isEmpty()) {
            concatenation.startOffset = savedStartOffset + iterator.current;
        }
        concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
        concatenation.endOffset = savedStartOffset + iterator.end;
    }

    /**
     * Generates a word/number part, updating the appropriate attributes
     *
     * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
     */
    private void generatePart(boolean isSingleWord) {
        clearAttributes();
        termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);

        int startOffset = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
        int endOffset = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;

        offsetAttribute.setOffset(startOffset, endOffset);
        posIncAttribute.setPositionIncrement(position(false));
        typeAttribute.setType(savedType);
    }

    /**
     * Gets the position increment gap for a subword or concatenation
     *
     * @param inject true if this token wants to be injected
     * @return position increment gap
     */
    private int position(boolean inject) {
        int posInc = accumPosInc;

        if (hasOutputToken) {
            accumPosInc = 0;
            return inject ? 0 : Math.max(1, posInc);
        }

        hasOutputToken = true;

        if (!hasOutputFollowingOriginal) {
            // the first token following the original is 0 regardless
            hasOutputFollowingOriginal = true;
            return 0;
        }
        // clear the accumulated position increment
        accumPosInc = 0;
        return Math.max(1, posInc);
    }

    /**
     * Checks if the given word type includes {@link #ALPHA}
     *
     * @param type Word type to check
     * @return {@code true} if the type contains ALPHA, {@code false} otherwise
     */
    static boolean isAlpha(int type) {
        return (type & ALPHA) != 0;
    }

    /**
     * Checks if the given word type includes {@link #DIGIT}
     *
     * @param type Word type to check
     * @return {@code true} if the type contains DIGIT, {@code false} otherwise
     */
    static boolean isDigit(int type) {
        return (type & DIGIT) != 0;
    }

    /**
     * Checks if the given word type includes {@link #SUBWORD_DELIM}
     *
     * @param type Word type to check
     * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
     */
    static boolean isSubwordDelim(int type) {
        return (type & SUBWORD_DELIM) != 0;
    }

    /**
     * Checks if the given word type includes {@link #UPPER}
     *
     * @param type Word type to check
     * @return {@code true} if the type contains UPPER, {@code false} otherwise
     */
    static boolean isUpper(int type) {
        return (type & UPPER) != 0;
    }

    // ================================================= Inner Classes =================================================

    /**
     * A WDF concatenated 'run'
     */
    final class WordDelimiterConcatenation {
        final StringBuilder buffer = new StringBuilder();
        int startOffset;
        int endOffset;
        int type;
        int subwordCount;

        /**
         * Appends length characters of the given text, starting at the given offset, to the concatenation
         *
         * @param text   Text to append
         * @param offset Offset in the text at which the appended region starts
         * @param length Length of the text to append
         */
        void append(char text[], int offset, int length) {
            buffer.append(text, offset, length);
            subwordCount++;
        }

        /**
         * Writes the concatenation to the attributes
         */
        void write() {
            clearAttributes();
            if (termAttribute.length() < buffer.length()) {
                termAttribute.resizeBuffer(buffer.length());
            }
            char termbuffer[] = termAttribute.buffer();

            buffer.getChars(0, buffer.length(), termbuffer, 0);
            termAttribute.setLength(buffer.length());

            if (hasIllegalOffsets) {
                offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
            } else {
                offsetAttribute.setOffset(startOffset, endOffset);
            }
            posIncAttribute.setPositionIncrement(position(true));
            typeAttribute.setType(savedType);
            accumPosInc = 0;
        }

        /**
         * Determines if the concatenation is empty
         *
         * @return {@code true} if the concatenation is empty, {@code false} otherwise
         */
        boolean isEmpty() {
            return buffer.length() == 0;
        }

        /**
         * Clears the concatenation and resets its state
         */
        void clear() {
            buffer.setLength(0);
            startOffset = endOffset = type = subwordCount = 0;
        }

        /**
         * Convenience method for the common scenario of having to write the concatenation and then clear its state
         */
        void writeAndClear() {
            write();
            clear();
        }
    }
    // questions:
    // negative numbers? -42 indexed as just 42?
    // dollar sign? $42
    // percent sign? 33%
    // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
}
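As a quick illustration of the filter's contract, here is a minimal standalone demo (not part of this commit). It assumes a Lucene 3.x-era WhitespaceTokenizer, per the javadoc's advice to use a tokenizer that keeps intra-word delimiters; the class name and Version constant are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class WordDelimiterFilterDemo {
    public static void main(String[] args) throws Exception {
        // Whitespace tokenization keeps intra-word delimiters intact.
        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_31,
                new StringReader("Super-Duper-XL500-42-AutoCoder!"));
        // Flags follow the int-based constructor above:
        // generateWordParts=1, generateNumberParts=1, catenateWords=1, catenateNumbers=0,
        // catenateAll=0, splitOnCaseChange=1, preserveOriginal=0, splitOnNumerics=1,
        // stemEnglishPossessive=1, no protected words.
        stream = new WordDelimiterFilter(stream, 1, 1, 1, 0, 0, 1, 0, 1, 1, null);

        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = stream.getAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1;
        while (stream.incrementToken()) {
            position += posInc.getPositionIncrement();
            System.out.println(position + ":" + term.toString());
        }
        // Expected, per the class javadoc (catenated runs share the position of the run's last subword):
        // 0:Super 1:Duper 2:XL 2:SuperDuperXL 3:500 4:42 5:Auto 6:Coder 6:AutoCoder
    }
}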
WordDelimiterIterator.java (new file)
@@ -0,0 +1,341 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
 *
 * @lucene.internal
 */
public final class WordDelimiterIterator {

    /**
     * Indicates the end of iteration
     */
    public static final int DONE = -1;

    public static final byte[] DEFAULT_WORD_DELIM_TABLE;

    char text[];
    int length;

    /**
     * start position of text, excluding leading delimiters
     */
    int startBounds;
    /**
     * end position of text, excluding trailing delimiters
     */
    int endBounds;

    /**
     * Beginning of subword
     */
    int current;
    /**
     * End of subword
     */
    int end;

    /* does this string end with a possessive such as 's */
    private boolean hasFinalPossessive = false;

    /**
     * If false, causes case changes to be ignored (subwords will only be generated
     * given SUBWORD_DELIM tokens). (Defaults to true)
     */
    final boolean splitOnCaseChange;

    /**
     * If false, causes numeric changes to be ignored (subwords will only be generated
     * given SUBWORD_DELIM tokens). (Defaults to true)
     */
    final boolean splitOnNumerics;

    /**
     * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
     * "O'Neil's" => "O", "Neil"
     */
    final boolean stemEnglishPossessive;

    private final byte[] charTypeTable;

    /**
     * if true, we need to skip over a possessive found in the last call to next()
     */
    private boolean skipPossessive = false;

    // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords
    // will be done if separated by these chars?) "," would be an obvious candidate...
    static {
        byte[] tab = new byte[256];
        for (int i = 0; i < 256; i++) {
            byte code = 0;
            if (Character.isLowerCase(i)) {
                code |= LOWER;
            } else if (Character.isUpperCase(i)) {
                code |= UPPER;
            } else if (Character.isDigit(i)) {
                code |= DIGIT;
            }
            if (code == 0) {
                code = SUBWORD_DELIM;
            }
            tab[i] = code;
        }
        DEFAULT_WORD_DELIM_TABLE = tab;
    }

    /**
     * Creates a new WordDelimiterIterator operating with the supplied rules.
     *
     * @param charTypeTable         table containing character types
     * @param splitOnCaseChange     if true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
     * @param splitOnNumerics       if true, causes "j2se" to be three tokens: "j" "2" "se"
     * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
     */
    WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
        this.charTypeTable = charTypeTable;
        this.splitOnCaseChange = splitOnCaseChange;
        this.splitOnNumerics = splitOnNumerics;
        this.stemEnglishPossessive = stemEnglishPossessive;
    }

    /**
     * Advances to the next subword in the string.
     *
     * @return index of the next subword, or {@link #DONE} if all subwords have been returned
     */
    int next() {
        current = end;
        if (current == DONE) {
            return DONE;
        }

        if (skipPossessive) {
            current += 2;
            skipPossessive = false;
        }

        int lastType = 0;

        while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
            current++;
        }

        if (current >= endBounds) {
            return end = DONE;
        }

        for (end = current + 1; end < endBounds; end++) {
            int type = charType(text[end]);
            if (isBreak(lastType, type)) {
                break;
            }
            lastType = type;
        }

        if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
            skipPossessive = true;
        }

        return end;
    }

    /**
     * Returns the type of the current subword.
     * This currently uses the type of the first character in the subword.
     *
     * @return type of the current word
     */
    int type() {
        if (end == DONE) {
            return 0;
        }

        int type = charType(text[current]);
        switch (type) {
            // return ALPHA word type for both lower and upper
            case LOWER:
            case UPPER:
                return ALPHA;
            default:
                return type;
        }
    }

    /**
     * Resets the text to a new value, and resets all state
     *
     * @param text   New text
     * @param length length of the text
     */
    void setText(char text[], int length) {
        this.text = text;
        this.length = this.endBounds = length;
        current = startBounds = end = 0;
        skipPossessive = hasFinalPossessive = false;
        setBounds();
    }

    // ================================================= Helper Methods ================================================

    /**
     * Determines whether the transition from lastType to type indicates a break
     *
     * @param lastType Last subword type
     * @param type     Current subword type
     * @return {@code true} if the transition indicates a break, {@code false} otherwise
     */
    private boolean isBreak(int lastType, int type) {
        if ((type & lastType) != 0) {
            return false;
        }

        if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
            // ALPHA->ALPHA: always ignore if case isn't considered
            return false;
        } else if (isUpper(lastType) && isAlpha(type)) {
            // UPPER->letter: don't split
            return false;
        } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
            // ALPHA->NUMERIC, NUMERIC->ALPHA: don't split
            return false;
        }

        return true;
    }

    /**
     * Determines if the current word contains only one subword. Note, it could potentially be surrounded by delimiters.
     *
     * @return {@code true} if the current word contains only one subword, {@code false} otherwise
     */
    boolean isSingleWord() {
        if (hasFinalPossessive) {
            return current == startBounds && end == endBounds - 2;
        } else {
            return current == startBounds && end == endBounds;
        }
    }

    /**
     * Sets the internal word bounds (removes leading and trailing delimiters). Note, if a possessive is found,
     * it is not removed yet, only noted.
     */
    private void setBounds() {
        while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
            startBounds++;
        }

        while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
            endBounds--;
        }
        if (endsWithPossessive(endBounds)) {
            hasFinalPossessive = true;
        }
        current = startBounds;
    }

    /**
     * Determines if the text at the given position indicates an English possessive which should be removed
     *
     * @param pos Position in the text to check
     * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
     */
    private boolean endsWithPossessive(int pos) {
        return (stemEnglishPossessive &&
                pos > 2 &&
                text[pos - 2] == '\'' &&
                (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
                isAlpha(charType(text[pos - 3])) &&
                (pos == endBounds || isSubwordDelim(charType(text[pos]))));
    }

    /**
     * Determines the type of the given character
     *
     * @param ch Character whose type is to be determined
     * @return Type of the character
     */
    private int charType(int ch) {
        if (ch < charTypeTable.length) {
            return charTypeTable[ch];
        }
        return getType(ch);
    }

    /**
     * Computes the type of the given character
     *
     * @param ch Character whose type is to be determined
     * @return Type of the character
     */
    public static byte getType(int ch) {
        switch (Character.getType(ch)) {
            case Character.UPPERCASE_LETTER:
                return UPPER;
            case Character.LOWERCASE_LETTER:
                return LOWER;

            case Character.TITLECASE_LETTER:
            case Character.MODIFIER_LETTER:
            case Character.OTHER_LETTER:
            case Character.NON_SPACING_MARK:
            case Character.ENCLOSING_MARK: // depends what it encloses?
            case Character.COMBINING_SPACING_MARK:
                return ALPHA;

            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.LETTER_NUMBER:
            case Character.OTHER_NUMBER:
                return DIGIT;

            // case Character.SPACE_SEPARATOR:
            // case Character.LINE_SEPARATOR:
            // case Character.PARAGRAPH_SEPARATOR:
            // case Character.CONTROL:
            // case Character.FORMAT:
            // case Character.PRIVATE_USE:

            case Character.SURROGATE: // prevent splitting
                return ALPHA | DIGIT;

            // case Character.DASH_PUNCTUATION:
            // case Character.START_PUNCTUATION:
            // case Character.END_PUNCTUATION:
            // case Character.CONNECTOR_PUNCTUATION:
            // case Character.OTHER_PUNCTUATION:
            // case Character.MATH_SYMBOL:
            // case Character.CURRENCY_SYMBOL:
            // case Character.MODIFIER_SYMBOL:
            // case Character.OTHER_SYMBOL:
            // case Character.INITIAL_QUOTE_PUNCTUATION:
            // case Character.FINAL_QUOTE_PUNCTUATION:

            default:
                return SUBWORD_DELIM;
        }
    }
}
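A minimal sketch of driving the iterator directly (illustrative only; the constructor is package-private, so this assumes same-package access, consistent with its @lucene.internal status):

WordDelimiterIterator it = new WordDelimiterIterator(
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, true, true, true);
char[] text = "PowerShot".toCharArray();
it.setText(text, text.length);
while (it.next() != WordDelimiterIterator.DONE) {
    // first pass prints "Power", second prints "Shot"; type() reports ALPHA for both
    System.out.println(new String(text, it.current, it.end - it.current));
}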
Analysis.java
@@ -48,14 +48,17 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.base.Charsets;
 import org.elasticsearch.common.collect.ImmutableMap;
 import org.elasticsearch.common.collect.ImmutableSet;
 import org.elasticsearch.common.collect.Iterators;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;

-import java.io.File;
 import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;
@@ -148,27 +151,22 @@ public class Analysis {
      * @throws ElasticSearchIllegalArgumentException
      *         If the word list cannot be found at either key.
      */
-    public static Set<String> getWordList(Settings settings, String settingPrefix) {
+    public static Set<String> getWordList(Environment env, Settings settings, String settingPrefix) {
         String wordListPath = settings.get(settingPrefix + "_path", null);

         if (wordListPath == null) {
             String[] explicitWordList = settings.getAsArray(settingPrefix, null);
             if (explicitWordList == null) {
-                String message = String.format("%s or %s_path must be provided.", settingPrefix, settingPrefix);
-                throw new ElasticSearchIllegalArgumentException(message);
+                return null;
             } else {
-
                 return new HashSet<String>(Arrays.asList(explicitWordList));
             }
         }

-        File wordListFile = new File(wordListPath);
-        if (!wordListFile.exists()) {
-            throw new ElasticSearchIllegalArgumentException(settingPrefix + "_path file must exist.");
-        }
+        URL wordListFile = env.resolveConfig(wordListPath);

         try {
-            return WordlistLoader.getWordSet(wordListFile);
+            return WordlistLoader.getWordSet(new InputStreamReader(wordListFile.openStream(), Charsets.UTF_8), "#");
         } catch (IOException ioe) {
             String message = String.format("IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
             throw new ElasticSearchIllegalArgumentException(message);
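Worth noting: getWordList no longer throws when neither "<prefix>" nor "<prefix>_path" is configured; it now returns null and leaves enforcement to each caller (the compound-word factories below throw, while WordDelimiterTokenFilterFactory falls back to defaults). A caller-side sketch of the new contract (a fragment; "settings" is assumed in scope and the "my_filter" name is illustrative):

Environment env = new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS);
Set<String> words = Analysis.getWordList(env, settings, "index.analysis.filter.my_filter.word_list");
if (words == null) {
    // neither "word_list" nor "word_list_path" was set;
    // the caller decides whether that is an error
}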
AnalysisModule.java
@@ -361,6 +361,7 @@ public class AnalysisModule extends AbstractModule {
         @Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
             tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);

             tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);
@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private final byte[] charTypeTable;
|
||||
private final boolean generateWordParts;
|
||||
private final boolean generateNumberParts;
|
||||
private final boolean catenateWords;
|
||||
private final boolean catenateNumbers;
|
||||
private final boolean catenateAll;
|
||||
private final boolean splitOnCaseChange;
|
||||
private final boolean preserveOriginal;
|
||||
private final boolean splitOnNumerics;
|
||||
private final boolean stemEnglishPossessive;
|
||||
private final CharArraySet protoWords;
|
||||
|
||||
@Inject public WordDelimiterTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
|
||||
// Sample Format for the type table:
|
||||
// $ => DIGIT
|
||||
// % => DIGIT
|
||||
// . => DIGIT
|
||||
// \u002C => DIGIT
|
||||
// \u200D => ALPHANUM
|
||||
Set<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
|
||||
if (charTypeTableValues == null) {
|
||||
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
|
||||
} else {
|
||||
this.charTypeTable = parseTypes(charTypeTableValues);
|
||||
}
|
||||
|
||||
// If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
|
||||
this.generateWordParts = settings.getAsBoolean("generate_word_parts", true);
|
||||
// If 1, causes number subwords to be generated: "500-42" => "500" "42"
|
||||
this.generateNumberParts = settings.getAsBoolean("generate_number_parts", true);
|
||||
// 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
|
||||
this.catenateWords = settings.getAsBoolean("catenate_words", false);
|
||||
// If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
|
||||
this.catenateNumbers = settings.getAsBoolean("catenate_numbers", false);
|
||||
// If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
|
||||
this.catenateAll = settings.getAsBoolean("catenate_all", false);
|
||||
// 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
|
||||
this.splitOnCaseChange = settings.getAsBoolean("split_on_case_change", true);
|
||||
// If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
|
||||
this.preserveOriginal = settings.getAsBoolean("preserve_original", false);
|
||||
// 1, causes "j2se" to be three tokens; "j" "2" "se"
|
||||
this.splitOnNumerics = settings.getAsBoolean("split_on_numerics", true);
|
||||
// If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
this.stemEnglishPossessive = settings.getAsBoolean("stem_english_possessive", true);
|
||||
// If not null is the set of tokens to protect from being delimited
|
||||
Set<String> protectedWords = Analysis.getWordList(env, settings, "protected_words");
|
||||
this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
|
||||
}
|
||||
|
||||
@Override public TokenStream create(TokenStream tokenStream) {
|
||||
return new WordDelimiterFilter(tokenStream,
|
||||
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
|
||||
generateWordParts ? 1 : 0,
|
||||
generateNumberParts ? 1 : 0,
|
||||
catenateWords ? 1 : 0,
|
||||
catenateNumbers ? 1 : 0,
|
||||
catenateAll ? 1 : 0,
|
||||
splitOnCaseChange ? 1 : 0,
|
||||
preserveOriginal ? 1 : 0,
|
||||
splitOnNumerics ? 1 : 0,
|
||||
stemEnglishPossessive ? 1 : 0,
|
||||
protoWords);
|
||||
}
|
||||
|
||||
// source => type
|
||||
private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
|
||||
|
||||
/**
|
||||
* parses a list of MappingCharFilter style rules into a custom byte[] type table
|
||||
*/
|
||||
private byte[] parseTypes(Collection<String> rules) {
|
||||
SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>();
|
||||
for (String rule : rules) {
|
||||
Matcher m = typePattern.matcher(rule);
|
||||
if (!m.find())
|
||||
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
|
||||
String lhs = parseString(m.group(1).trim());
|
||||
Byte rhs = parseType(m.group(2).trim());
|
||||
if (lhs.length() != 1)
|
||||
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
|
||||
if (rhs == null)
|
||||
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
|
||||
typeMap.put(lhs.charAt(0), rhs);
|
||||
}
|
||||
|
||||
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
|
||||
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
|
||||
for (int i = 0; i < types.length; i++)
|
||||
types[i] = WordDelimiterIterator.getType(i);
|
||||
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
|
||||
types[mapping.getKey()] = mapping.getValue();
|
||||
return types;
|
||||
}
|
||||
|
||||
private Byte parseType(String s) {
|
||||
if (s.equals("LOWER"))
|
||||
return WordDelimiterFilter.LOWER;
|
||||
else if (s.equals("UPPER"))
|
||||
return WordDelimiterFilter.UPPER;
|
||||
else if (s.equals("ALPHA"))
|
||||
return WordDelimiterFilter.ALPHA;
|
||||
else if (s.equals("DIGIT"))
|
||||
return WordDelimiterFilter.DIGIT;
|
||||
else if (s.equals("ALPHANUM"))
|
||||
return WordDelimiterFilter.ALPHANUM;
|
||||
else if (s.equals("SUBWORD_DELIM"))
|
||||
return WordDelimiterFilter.SUBWORD_DELIM;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
char[] out = new char[256];
|
||||
|
||||
private String parseString(String s) {
|
||||
int readPos = 0;
|
||||
int len = s.length();
|
||||
int writePos = 0;
|
||||
while (readPos < len) {
|
||||
char c = s.charAt(readPos++);
|
||||
if (c == '\\') {
|
||||
if (readPos >= len)
|
||||
throw new RuntimeException("Invalid escaped char in [" + s + "]");
|
||||
c = s.charAt(readPos++);
|
||||
switch (c) {
|
||||
case '\\':
|
||||
c = '\\';
|
||||
break;
|
||||
case 'n':
|
||||
c = '\n';
|
||||
break;
|
||||
case 't':
|
||||
c = '\t';
|
||||
break;
|
||||
case 'r':
|
||||
c = '\r';
|
||||
break;
|
||||
case 'b':
|
||||
c = '\b';
|
||||
break;
|
||||
case 'f':
|
||||
c = '\f';
|
||||
break;
|
||||
case 'u':
|
||||
if (readPos + 3 >= len)
|
||||
throw new RuntimeException("Invalid escaped char in [" + s + "]");
|
||||
c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
|
||||
readPos += 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
out[writePos++] = c;
|
||||
}
|
||||
return new String(out, 0, writePos);
|
||||
}
|
||||
}
|
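The settings keys read above map one-to-one onto the filter's constructor flags. A hypothetical configuration sketch (the analyzer and filter names are invented; the type_table entries come from the sample-format comment in the constructor, and the loadFromSource pattern mirrors AnalysisModuleTests below):

Settings settings = ImmutableSettings.settingsBuilder().loadFromSource(
        "index:\n" +
        "  analysis:\n" +
        "    analyzer:\n" +
        "      my_analyzer:\n" +
        "        tokenizer: whitespace\n" +
        "        filter: [my_word_delimiter]\n" +
        "    filter:\n" +
        "      my_word_delimiter:\n" +
        "        type: word_delimiter\n" +
        "        catenate_words: true\n" +
        "        preserve_original: true\n" +
        "        protected_words: [wi-fi]\n" +
        "        type_table: [\"$ => DIGIT\", \"% => DIGIT\"]\n").build();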
AbstractCompoundWordTokenFilterFactory.java
@@ -20,9 +20,11 @@
 package org.elasticsearch.index.analysis.compound;

 import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.Analysis;
@@ -44,13 +46,16 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
     protected final boolean onlyLongestMatch;
     protected final Set<String> wordList;

-    @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+    @Inject public AbstractCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);

         minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
         minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
         maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
         onlyLongestMatch = settings.getAsBoolean("only_longest_max", false);
-        wordList = Analysis.getWordList(settings, "word_list");
+        wordList = Analysis.getWordList(env, settings, "word_list");
+        if (wordList == null) {
+            throw new ElasticSearchIllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
+        }
     }
 }
DictionaryCompoundWordTokenFilterFactory.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.AnalysisSettingsRequired;
 import org.elasticsearch.index.settings.IndexSettings;
@@ -39,8 +40,8 @@ import org.elasticsearch.index.settings.IndexSettings;
 @AnalysisSettingsRequired
 public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

-    @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
-        super(index, indexSettings, name, settings);
+    @Inject public DictionaryCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, env, name, settings);
     }

     @Override public TokenStream create(TokenStream tokenStream) {
HyphenationCompoundWordTokenFilterFactory.java
@@ -26,11 +26,13 @@ import org.elasticsearch.ElasticSearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.AnalysisSettingsRequired;
 import org.elasticsearch.index.settings.IndexSettings;
+import org.xml.sax.InputSource;

-import java.io.File;
+import java.net.URL;

 /**
  * Uses the {@link org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter} to decompound tokens based on hyphenation rules.
@@ -44,21 +46,18 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW

     private final HyphenationTree hyphenationTree;

-    @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
-        super(index, indexSettings, name, settings);
+    @Inject public HyphenationCompoundWordTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, env, name, settings);

         String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
         if (hyphenationPatternsPath == null) {
             throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path is a required setting.");
         }

-        File hyphenationPatternsFile = new File(hyphenationPatternsPath);
-        if (!hyphenationPatternsFile.exists()) {
-            throw new ElasticSearchIllegalArgumentException("hyphenation_patterns_path file must exist.");
-        }
+        URL hyphenationPatternsFile = env.resolveConfig(hyphenationPatternsPath);

         try {
-            hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(hyphenationPatternsFile);
+            hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(hyphenationPatternsFile.toExternalForm()));
         } catch (Exception e) {
             throw new ElasticSearchIllegalArgumentException("Exception while reading hyphenation_patterns_path: " + e.getMessage());
         }
AnalysisModuleTests.java
@@ -24,7 +24,10 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
 import org.elasticsearch.common.lucene.analysis.HTMLStripCharFilter;
+import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexNameModule;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
@@ -61,6 +64,7 @@ public class AnalysisModuleTests {
     private void testSimpleConfiguration(Settings settings) {
         Index index = new Index("test");
         Injector injector = new ModulesBuilder().add(
+                new EnvironmentModule(new Environment(settings)),
                 new IndexSettingsModule(index, settings),
                 new IndexNameModule(index),
                 new AnalysisModule(settings)).createInjector();
@@ -120,18 +124,19 @@ public class AnalysisModuleTests {
         assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
         assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

-        Set<String> wordList = Analysis.getWordList(settings, "index.analysis.filter.dict_dec.word_list");
+        Set<String> wordList = Analysis.getWordList(null, settings, "index.analysis.filter.dict_dec.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
     }

     @Test public void testWordListPath() throws Exception {
+        Environment env = new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS);
         String[] words = new String[]{"donau", "dampf", "schiff", "spargel", "creme", "suppe"};

         File wordListFile = generateWordList(words);
         Settings settings = settingsBuilder().loadFromSource("index: \n  word_list_path: " + wordListFile.getAbsolutePath()).build();

-        Set<String> wordList = Analysis.getWordList(settings, "index.word_list");
+        Set<String> wordList = Analysis.getWordList(env, settings, "index.word_list");
         MatcherAssert.assertThat(wordList.size(), equalTo(6));
         MatcherAssert.assertThat(wordList, hasItems(words));
     }