diff --git a/pom.xml b/pom.xml
index 05808717046..ba1ec8d5ab7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,7 +30,7 @@
- 3.5.0
+ 3.6.0
diff --git a/src/main/java/org/apache/lucene/index/memory/CustomMemoryIndex.java b/src/main/java/org/apache/lucene/index/memory/CustomMemoryIndex.java
index 3954d5d0758..01617403751 100644
--- a/src/main/java/org/apache/lucene/index/memory/CustomMemoryIndex.java
+++ b/src/main/java/org/apache/lucene/index/memory/CustomMemoryIndex.java
@@ -46,10 +46,10 @@ import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
+import org.elasticsearch.common.io.FastStringReader;
import java.io.IOException;
import java.io.Serializable;
-import java.io.StringReader;
import java.util.*;
/**
@@ -201,6 +201,8 @@ public class CustomMemoryIndex implements Serializable {
private static final boolean DEBUG = false;
+ private final FieldInfos fieldInfos;
+
/**
* Sorts term entries into ascending order; also works for
* Arrays.binarySearch() and Arrays.sort()
@@ -234,6 +236,7 @@ public class CustomMemoryIndex implements Serializable {
*/
private CustomMemoryIndex(boolean storeOffsets) {
this.stride = storeOffsets ? 3 : 1;
+ fieldInfos = new FieldInfos();
}
/**
@@ -257,8 +260,12 @@ public class CustomMemoryIndex implements Serializable {
if (analyzer == null)
throw new IllegalArgumentException("analyzer must not be null");
- TokenStream stream = analyzer.tokenStream(fieldName,
- new StringReader(text));
+ TokenStream stream;
+ try {
+ stream = analyzer.reusableTokenStream(fieldName, new FastStringReader(text));
+ } catch (IOException ex) {
+ throw new RuntimeException(ex);
+ }
addField(fieldName, stream);
}
@@ -338,6 +345,8 @@ public class CustomMemoryIndex implements Serializable {
int numOverlapTokens = 0;
int pos = -1;
+ fieldInfos.add(fieldName, true, true);
+
// CHANGE
if (fields.get(fieldName) != null) {
Info info = fields.get(fieldName);
@@ -760,13 +769,12 @@ public class CustomMemoryIndex implements Serializable {
* Search support for Lucene framework integration; implements all methods
* required by the Lucene IndexReader contracts.
*/
- private final class MemoryIndexReader extends IndexReader {
+ final class MemoryIndexReader extends IndexReader {
private Searcher searcher; // needed to find searcher.getSimilarity()
private MemoryIndexReader() {
super(); // avoid as much superclass baggage as possible
- readerFinishedListeners = Collections.synchronizedSet(new HashSet());
}
private Info getInfo(String fieldName) {
@@ -1174,12 +1182,6 @@ public class CustomMemoryIndex implements Serializable {
return 1;
}
- @Override
- public Document document(int n) {
- if (DEBUG) System.err.println("MemoryIndexReader.document");
- return new Document(); // there are no stored fields
- }
-
//When we convert to JDK 1.5 make this Set<String>
@Override
public Document document(int n, FieldSelector fieldSelector) throws IOException {
@@ -1219,20 +1221,9 @@ public class CustomMemoryIndex implements Serializable {
if (DEBUG) System.err.println("MemoryIndexReader.doClose");
}
- // lucene >= 1.9 (remove this method for lucene-1.4.3)
@Override
- public Collection getFieldNames(FieldOption fieldOption) {
- if (DEBUG) System.err.println("MemoryIndexReader.getFieldNamesOption");
- if (fieldOption == FieldOption.UNINDEXED)
- return Collections.emptySet();
- if (fieldOption == FieldOption.INDEXED_NO_TERMVECTOR)
- return Collections.emptySet();
- if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET && stride == 1)
- return Collections.emptySet();
- if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET && stride == 1)
- return Collections.emptySet();
-
- return Collections.unmodifiableSet(fields.keySet());
+ public FieldInfos getFieldInfos() {
+ return fieldInfos;
}
}
diff --git a/src/main/java/org/elasticsearch/common/lucene/Lucene.java b/src/main/java/org/elasticsearch/common/lucene/Lucene.java
index a3f6b980bd5..be09a3bd924 100644
--- a/src/main/java/org/elasticsearch/common/lucene/Lucene.java
+++ b/src/main/java/org/elasticsearch/common/lucene/Lucene.java
@@ -40,7 +40,7 @@ import java.lang.reflect.Field;
*/
public class Lucene {
- public static final Version VERSION = Version.LUCENE_35;
+ public static final Version VERSION = Version.LUCENE_36;
public static final Version ANALYZER_VERSION = VERSION;
public static final Version QUERYPARSER_VERSION = VERSION;
@@ -55,6 +55,9 @@ public class Lucene {
if (version == null) {
return defaultVersion;
}
+ if ("3.6".equals(version)) {
+ return Version.LUCENE_36;
+ }
if ("3.5".equals(version)) {
return Version.LUCENE_35;
}
diff --git a/src/main/java/org/elasticsearch/common/lucene/analysis/HTMLStripCharFilter.java b/src/main/java/org/elasticsearch/common/lucene/analysis/HTMLStripCharFilter.java
deleted file mode 100644
index d6ab8e359cb..00000000000
--- a/src/main/java/org/elasticsearch/common/lucene/analysis/HTMLStripCharFilter.java
+++ /dev/null
@@ -1,1373 +0,0 @@
-/*
- * Licensed to ElasticSearch and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. ElasticSearch licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.common.lucene.analysis;
-
-import org.apache.lucene.analysis.BaseCharFilter;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.HashMap;
-import java.util.Set;
-
-/**
- *
- */
-// LUCENE MONITOR: Once the next Lucene version is out, use the built in HTML filter
-public class HTMLStripCharFilter extends BaseCharFilter {
- private int readAheadLimit = DEFAULT_READ_AHEAD;
- private int safeReadAheadLimit = readAheadLimit - 3;
- private int numWhitespace = 0;
- private int numRead = 0;
- private int numEaten = 0;
- private int numReturned = 0;
- private int lastMark;
- private Set escapedTags;
-
- // pushback buffer
- private final StringBuilder pushed = new StringBuilder();
- private static final int EOF = -1;
- private static final int MISMATCH = -2;
-
- private static final int MATCH = -3;
- // temporary buffer
- private final StringBuilder sb = new StringBuilder();
- public static final int DEFAULT_READ_AHEAD = 8192;
-
-
- public static void main(String[] args) throws IOException {
- Reader in = new HTMLStripCharFilter(
- CharReader.get(new InputStreamReader(System.in)));
- int ch;
- while ((ch = in.read()) != -1) System.out.print((char) ch);
- }
-
- public HTMLStripCharFilter(CharStream source) {
- super(source.markSupported() ? source : CharReader.get(new BufferedReader(source)));
- }
-
- public HTMLStripCharFilter(CharStream source, Set escapedTags) {
- this(source);
- this.escapedTags = escapedTags;
- }
-
- public HTMLStripCharFilter(CharStream source, Set escapedTags, int readAheadLimit) {
- this(source);
- this.escapedTags = escapedTags;
- this.readAheadLimit = readAheadLimit;
- safeReadAheadLimit = readAheadLimit - 3;
- }
-
- public int getReadAheadLimit() {
- return readAheadLimit;
- }
-
- private int next() throws IOException {
- int len = pushed.length();
- if (len > 0) {
- int ch = pushed.charAt(len - 1);
- pushed.setLength(len - 1);
- return ch;
- }
- numRead++;
- return input.read();
- }
-
- private int nextSkipWS() throws IOException {
- int ch = next();
- while (isSpace(ch)) ch = next();
- return ch;
- }
-
- private int peek() throws IOException {
- int len = pushed.length();
- if (len > 0) {
- return pushed.charAt(len - 1);
- }
- int ch = input.read();
- push(ch);
- return ch;
- }
-
- private void push(int ch) {
- pushed.append((char) ch);
- }
-
-
- private boolean isSpace(int ch) {
- switch (ch) {
- case ' ':
- case '\n':
- case '\r':
- case '\t':
- return true;
- default:
- return false;
- }
- }
-
- private boolean isHex(int ch) {
- return (ch >= '0' && ch <= '9') ||
- (ch >= 'A' && ch <= 'Z') ||
- (ch >= 'a' && ch <= 'z');
- }
-
- private boolean isAlpha(int ch) {
- return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
- }
-
- private boolean isDigit(int ch) {
- return ch >= '0' && ch <= '9';
- }
-
- /**
- * From HTML 4.0
- * [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
- * [5] Name ::= (Letter | '_' | ':') (NameChar)*
- * [6] Names ::= Name (#x20 Name)*
- * [7] Nmtoken ::= (NameChar)+
- * [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
- * *
- */
-
- // should I include all id chars allowable by HTML/XML here?
- // including accented chars, ':', etc?
- private boolean isIdChar(int ch) {
- // return Character.isUnicodeIdentifierPart(ch);
- // isUnicodeIdentiferPart doesn't include '-'... shoudl I still
- // use it and add in '-',':',etc?
- return isAlpha(ch) || isDigit(ch) || ch == '.' ||
- ch == '-' || ch == '_' || ch == ':'
- || Character.isLetter(ch);
-
- }
-
- private boolean isFirstIdChar(int ch) {
- return Character.isUnicodeIdentifierStart(ch);
- // return isAlpha(ch) || ch=='_' || Character.isLetter(ch);
- }
-
-
- private void saveState() throws IOException {
- lastMark = numRead;
- input.mark(readAheadLimit);
- }
-
- private void restoreState() throws IOException {
- input.reset();
- pushed.setLength(0);
- }
-
- private int readNumericEntity() throws IOException {
- // "&#" has already been read at this point
- int eaten = 2;
-
- // is this decimal, hex, or nothing at all.
- int ch = next();
- int base = 10;
- boolean invalid = false;
- sb.setLength(0);
-
- if (isDigit(ch)) {
- // decimal character entity
- sb.append((char) ch);
- for (int i = 0; i < 10; i++) {
- ch = next();
- if (isDigit(ch)) {
- sb.append((char) ch);
- } else {
- break;
- }
- }
- } else if (ch == 'x') {
- eaten++;
- // hex character entity
- base = 16;
- sb.setLength(0);
- for (int i = 0; i < 10; i++) {
- ch = next();
- if (isHex(ch)) {
- sb.append((char) ch);
- } else {
- break;
- }
- }
- } else {
- return MISMATCH;
- }
-
-
- // In older HTML, an entity may not have always been terminated
- // with a semicolon. We'll also treat EOF or whitespace as terminating
- // the entity.
- try {
- if (ch == ';' || ch == -1) {
- // do not account for the eaten ";" due to the fact that we do output a char
- numWhitespace = sb.length() + eaten;
- return Integer.parseInt(sb.toString(), base);
- }
-
- // if whitespace terminated the entity, we need to return
- // that whitespace on the next call to read().
- if (isSpace(ch)) {
- push(ch);
- numWhitespace = sb.length() + eaten;
- return Integer.parseInt(sb.toString(), base);
- }
- } catch (NumberFormatException e) {
- return MISMATCH;
- }
-
- // Not an entity...
- return MISMATCH;
- }
-
- private int readEntity() throws IOException {
- int ch = next();
- if (ch == '#') return readNumericEntity();
-
- //read an entity reference
-
- // for an entity reference, require the ';' for safety.
- // otherwise we may try and convert part of some company
- // names to an entity. "Alpha&Beta Corp" for instance.
- //
- // TODO: perhaps I should special case some of the
- // more common ones like &amp to make the ';' optional...
-
- sb.setLength(0);
- sb.append((char) ch);
-
- for (int i = 0; i < safeReadAheadLimit; i++) {
- ch = next();
- if (Character.isLetter(ch)) {
- sb.append((char) ch);
- } else {
- break;
- }
- }
-
- if (ch == ';') {
- String entity = sb.toString();
- Character entityChar = entityTable.get(entity);
- if (entityChar != null) {
- numWhitespace = entity.length() + 1;
- return entityChar.charValue();
- }
- }
-
- return MISMATCH;
- }
-
- /**
- * valid comments according to HTML specs
- *
- *
- *
- *
- *
- * Hello -->
- *
- * #comments inside of an entity decl:
- *
- *
- * Turns out, IE & mozilla don't parse comments correctly.
- * Since this is meant to be a practical stripper, I'll just
- * try and duplicate what the browsers do.
- *
- *
- *
- *
- *
- *
- * *
- */
-
- private int readBang(boolean inScript) throws IOException {
- // at this point, "') {
-
- int ch = next();
- if (ch == '>') return MATCH;
-
- // if it starts with "
- //since we did readComment already, it may be the case that we are already deep into the read ahead buffer
- //so, we may need to abort sooner
- while ((numRead - lastMark) < safeReadAheadLimit) {
- ch = next();
- if (ch == '>') {
- return MATCH;
- } else if (ch < 0) {
- return MISMATCH;
- }
- }
- }
- return MISMATCH;
- }
-
- // tries to read comments the way browsers do, not
- // strictly by the standards.
- //
- // GRRRR. it turns out that in the wild, a
- //
-
- private int readComment(boolean inScript) throws IOException {
- // at this point "') {
- push(ch);
- push('-');
- continue;
- }
-
- return MATCH;
- } else if ((ch == '\'' || ch == '"') && inScript) {
- push(ch);
- int ret = readScriptString();
- // if this wasn't a string, there's not much we can do
- // at this point without having a stack of stream states in
- // order to "undo" just the latest.
- } else if (ch == '<') {
- eatSSI();
- }
-
- }
- return MISMATCH;
-
- }
-
-
- private int readTag() throws IOException {
- // at this point '<' has already been read
- int ch = next();
- if (!isAlpha(ch)) {
- push(ch);
- return MISMATCH;
- }
-
- sb.setLength(0);
- sb.append((char) ch);
- while ((numRead - lastMark) < safeReadAheadLimit) {
-
- ch = next();
- if (isIdChar(ch)) {
- sb.append((char) ch);
- } else if (ch == '/') {
- // Hmmm, a tag can close with "/>" as well as "/ >"
- // read end tag '/>' or '/ >', etc
- return nextSkipWS() == '>' ? MATCH : MISMATCH;
- } else {
- break;
- }
- }
- if (escapedTags != null && escapedTags.contains(sb.toString())) {
- //if this is a reservedTag, then keep it
- return MISMATCH;
- }
- // After the tag id, there needs to be either whitespace or
- // '>'
- if (!(ch == '>' || isSpace(ch))) {
- return MISMATCH;
- }
-
- if (ch != '>') {
- // process attributes
- while ((numRead - lastMark) < safeReadAheadLimit) {
- ch = next();
- if (isSpace(ch)) {
- continue;
- } else if (isFirstIdChar(ch)) {
- push(ch);
- int ret = readAttr2();
- if (ret == MISMATCH) return ret;
- } else if (ch == '/') {
- // read end tag '/>' or '/ >', etc
- return nextSkipWS() == '>' ? MATCH : MISMATCH;
- } else if (ch == '>') {
- break;
- } else {
- return MISMATCH;
- }
-
- }
- if ((numRead - lastMark) >= safeReadAheadLimit) {
- return MISMATCH;//exit out if we exceeded the buffer
- }
- }
-
- // We only get to this point after we have read the
- // entire tag. Now let's see if it's a special tag.
- String name = sb.toString();
- if (name.equalsIgnoreCase("script") || name.equalsIgnoreCase("style")) {
- // The content of script and style elements is
- // CDATA in HTML 4 but PCDATA in XHTML.
-
- /* From HTML4:
- Although the STYLE and SCRIPT elements use CDATA for their data model,
- for these elements, CDATA must be handled differently by user agents.
- Markup and entities must be treated as raw text and passed to the application
- as is. The first occurrence of the character sequence "</" (end-tag open
- delimiter) is treated as terminating the end of the element's content. In
- valid documents, this would be the end tag for the element.
- */
-
- // discard everything until endtag is hit (except
- // if it occurs in a comment.
-
- // reset the stream mark to here, since we know that we sucessfully matched
- // a tag, and if we can't find the end tag, this is where we will want
- // to roll back to.
- saveState();
- pushed.setLength(0);
- return findEndTag();
- }
- return MATCH;
- }
-
-
- // find an end tag, but beware of comments...
- // -->foo
- // beware markup in script strings: ...document.write("")foo
- // TODO: do I need to worry about CDATA sections "') return MISMATCH;
- return MATCH;
- } else if (ch == '\'' || ch == '"') {
- // read javascript string to avoid a false match.
- push(ch);
- int ret = readScriptString();
- // what to do about a non-match (non-terminated string?)
- // play it safe and index the rest of the data I guess...
- if (ret == MISMATCH) return MISMATCH;
- } else if (ch < 0) {
- return MISMATCH;
- }
-
- }
- return MISMATCH;
- }
-
-
- // read a string escaped by backslashes
-
- private int readScriptString() throws IOException {
- int quoteChar = next();
- if (quoteChar != '\'' && quoteChar != '"') return MISMATCH;
-
- while ((numRead - lastMark) < safeReadAheadLimit) {
- int ch = next();
- if (ch == quoteChar) return MATCH;
- else if (ch == '\\') {
- ch = next();
- } else if (ch < 0) {
- return MISMATCH;
- } else if (ch == '<') {
- eatSSI();
- }
-
- }
- return MISMATCH;
- }
-
-
- private int readName(boolean checkEscaped) throws IOException {
- StringBuilder builder = (checkEscaped && escapedTags != null) ? new StringBuilder() : null;
- int ch = next();
- if (builder != null) builder.append((char) ch);
- if (!isFirstIdChar(ch)) return MISMATCH;
- ch = next();
- if (builder != null) builder.append((char) ch);
- while (isIdChar(ch)) {
- ch = next();
- if (builder != null) builder.append((char) ch);
- }
- if (ch != -1) {
- push(ch);
-
- }
- //strip off the trailing >
- if (builder != null && escapedTags.contains(builder.substring(0, builder.length() - 1))) {
- return MISMATCH;
- }
- return MATCH;
- }
-
- /**
- * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
- * | "'" ([^<&'] | Reference)* "'"
- *
- * need to also handle unquoted attributes, and attributes w/o values:
- *