LUCENE-589, LUCENE-2246: fix intl bugs in contrib/demo

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031460 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-11-05 07:33:05 +00:00
parent 863dfccc76
commit 048cdb57f4
11 changed files with 261 additions and 44 deletions

View File

@ -134,6 +134,12 @@ Bug fixes
* LUCENE-2616: FastVectorHighlighter: out of alignment when the first value is
empty in multiValued field (Koji Sekiguchi)
* LUCENE-589: Fix contrib/demo for international documents.
(Curtis d'Entremont via Robert Muir)
* LUCENE-2246: Fix contrib/demo for Turkish html documents.
(Selim Nadi via Robert Muir)
API Changes

View File

@ -18,7 +18,9 @@ package org.apache.lucene.demo;
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
@ -40,7 +42,7 @@ public class FileDocument {
Reader field;
*/
public static Document Document(File f)
throws java.io.FileNotFoundException {
throws java.io.IOException {
// make a new, empty document
Document doc = new Document();
@ -58,9 +60,9 @@ public class FileDocument {
// Add the contents of the file to a field named "contents". Specify a Reader,
// so that the text of the file is tokenized and indexed, but not stored.
// Note that FileReader expects the file to be in the system's default encoding.
// Note that the file is expected to be in UTF-8 encoding (it is read through an InputStreamReader that decodes UTF-8).
// If that's not the case searching for special characters will fail.
doc.add(new Field("contents", new FileReader(f)));
doc.add(new Field("contents", new InputStreamReader(new FileInputStream(f), "UTF-8")));
// return the document
return doc;

View File

@ -64,7 +64,8 @@ public class HTMLDocument {
doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
FileInputStream fis = new FileInputStream(f);
HTMLParser parser = new HTMLParser(fis);
InputStreamReader reader = new InputStreamReader(fis, "UTF-8");
HTMLParser parser = new HTMLParser(reader);
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.

View File

@ -19,6 +19,7 @@ package org.apache.lucene.demo;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -124,7 +125,7 @@ public class SearchFiles {
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new FileReader(queries));
in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}

View File

@ -2,6 +2,7 @@
package org.apache.lucene.demo.html;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
public class HTMLParser implements HTMLParserConstants {
@ -40,14 +41,6 @@ public class HTMLParser implements HTMLParserConstants {
}
}
/**
* @deprecated Use HTMLParser(FileInputStream) instead
*/
@Deprecated
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
@ -231,7 +224,7 @@ InterruptedException {
Token t1, t2;
boolean inImg = false;
t1 = jj_consume_token(TagName);
String tagName = t1.image.toLowerCase();
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
@ -268,7 +261,7 @@ InterruptedException {
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase();
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@ -276,7 +269,7 @@ InterruptedException {
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase();
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@ -454,18 +447,18 @@ null)
finally { jj_save(1, xla); }
}
private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
private boolean jj_3_2() {
if (jj_scan_token(ArgQuote2)) return true;
if (jj_scan_token(CloseQuote2)) return true;
return false;
}
private boolean jj_3_1() {
if (jj_scan_token(ArgQuote1)) return true;
if (jj_scan_token(CloseQuote1)) return true;
return false;
}
/** Generated Token Manager. */
public HTMLParserTokenManager token_source;
SimpleCharStream jj_input_stream;

View File

@ -19,9 +19,9 @@
options {
STATIC = false;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_LOOKAHEAD = true;
//DEBUG_TOKEN_MANAGER = true;
UNICODE_INPUT = true;
}
PARSER_BEGIN(HTMLParser)
@ -29,6 +29,7 @@ PARSER_BEGIN(HTMLParser)
package org.apache.lucene.demo.html;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
public class HTMLParser {
@ -67,14 +68,6 @@ public class HTMLParser {
}
}
/**
* @deprecated Use HTMLParser(FileInputStream) instead
*/
@Deprecated
public HTMLParser(File file) throws FileNotFoundException {
this(new FileInputStream(file));
}
public String getTitle() throws IOException, InterruptedException {
if (pipeIn == null)
getReader(); // spawn parsing thread
@ -231,7 +224,7 @@ void Tag() throws IOException :
}
{
t1=<TagName> {
String tagName = t1.image.toLowerCase();
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
if(Tags.WS_ELEMS.contains(tagName) ) {
addSpace();
}
@ -253,7 +246,7 @@ void Tag() throws IOException :
)
&& t2 != null)
{
currentMetaTag=t2.image.toLowerCase();
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}
@ -261,7 +254,7 @@ void Tag() throws IOException :
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
{
currentMetaContent=t2.image.toLowerCase();
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
if(currentMetaTag != null && currentMetaContent != null) {
addMetaTag();
}

View File

@ -1,6 +1,7 @@
/* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */
package org.apache.lucene.demo.html;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
/** Token Manager. */
@ -218,6 +219,9 @@ private int jjStartNfaWithStates_0(int pos, int kind, int state)
return jjMoveNfa_0(state, pos + 1);
}
static final long[] jjbitVec0 = {
0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
};
static final long[] jjbitVec2 = {
0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL
};
private int jjMoveNfa_0(int startState, int curPos)
@ -460,6 +464,9 @@ private int jjMoveNfa_0(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -468,7 +475,7 @@ private int jjMoveNfa_0(int startState, int curPos)
{
case 22:
case 23:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 2)
kind = 2;
@ -476,7 +483,7 @@ private int jjMoveNfa_0(int startState, int curPos)
break;
case 26:
case 27:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 3)
kind = 3;
@ -562,6 +569,9 @@ private int jjMoveNfa_5(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -570,7 +580,7 @@ private int jjMoveNfa_5(int startState, int curPos)
{
case 1:
case 0:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 25)
kind = 25;
@ -660,6 +670,9 @@ private int jjMoveNfa_7(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -667,7 +680,7 @@ private int jjMoveNfa_7(int startState, int curPos)
switch(jjstateSet[--i])
{
case 0:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 29)
kind = 29;
@ -753,6 +766,9 @@ private int jjMoveNfa_4(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -761,7 +777,7 @@ private int jjMoveNfa_4(int startState, int curPos)
{
case 1:
case 0:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 23)
kind = 23;
@ -876,6 +892,9 @@ private int jjMoveNfa_3(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -884,7 +903,7 @@ private int jjMoveNfa_3(int startState, int curPos)
{
case 0:
case 1:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 19)
kind = 19;
@ -1042,6 +1061,9 @@ private int jjMoveNfa_6(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -1050,7 +1072,7 @@ private int jjMoveNfa_6(int startState, int curPos)
{
case 1:
case 0:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 27)
kind = 27;
@ -1183,6 +1205,9 @@ private int jjMoveNfa_1(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -1191,14 +1216,14 @@ private int jjMoveNfa_1(int startState, int curPos)
{
case 1:
case 0:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 14)
kind = 14;
jjCheckNAdd(0);
break;
case 3:
if ((jjbitVec0[i2] & l2) != 0L)
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
jjAddStates(18, 19);
break;
default : break;
@ -1336,6 +1361,9 @@ private int jjMoveNfa_2(int startState, int curPos)
}
else
{
int hiByte = (int)(curChar >> 8);
int i1 = hiByte >> 6;
long l1 = 1L << (hiByte & 077);
int i2 = (curChar & 0xff) >> 6;
long l2 = 1L << (curChar & 077);
do
@ -1344,7 +1372,7 @@ private int jjMoveNfa_2(int startState, int curPos)
{
case 0:
case 1:
if ((jjbitVec0[i2] & l2) == 0L)
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 16)
kind = 16;
@ -1371,6 +1399,18 @@ static final int[] jjnextStates = {
20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0,
4, 6, 3, 4,
};
/**
 * Tests whether a character outside the ASCII range is accepted by the
 * bit-vector tables used by the NFA transition methods.
 *
 * Callers derive the arguments from the current character: hiByte is the
 * character shifted right by 8, i1/l1 are the word index and bit mask built
 * from hiByte, and i2/l2 are the word index and bit mask built from the low
 * byte of the character.
 *
 * @return true when the selected bit is set in the table chosen by hiByte
 */
private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
{
  // A zero high byte selects the low-byte table, indexed by the low-byte word/mask.
  if (hiByte == 0)
  {
    return (jjbitVec2[i2] & l2) != 0L;
  }
  // Every other high byte is looked up in the high-byte table.
  return (jjbitVec0[i1] & l1) != 0L;
}
/** Token literal values. */
public static final String[] jjstrLiteralImages = {

View File

@ -0,0 +1,46 @@
package org.apache.lucene.demo;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Runs the demo's HTML indexer followed by the demo's searcher and inspects
 * what the searcher prints to standard output.
 */
public class TestDemo extends LuceneTestCase {

  /**
   * LUCENE-589: indexes the HTML test files, runs the queries from the test
   * query file, and expects the captured console output to report exactly one
   * matching document.
   */
  public void testUnicodeHtml() throws Exception {
    final File htmlDir = getDataFile("test-files/html");
    final File index = new File(TEMP_DIR, "demoIndex");
    final String[] indexArgs = { "-create", "-index", index.getPath(), htmlDir.getPath() };
    IndexHTML.main(indexArgs);

    final File queryFile = getDataFile("test-files/queries.txt");
    final String[] searchArgs = { "-index", index.getPath(), "-queries", queryFile.getPath() };

    // Remember the real stdout so it is always restored, even if the search fails.
    final PrintStream originalOut = System.out;
    try {
      final ByteArrayOutputStream captured = new ByteArrayOutputStream();
      final PrintStream capturingOut = new PrintStream(captured);
      System.setOut(capturingOut);

      SearchFiles.main(searchArgs);
      capturingOut.flush();

      // intentionally decoded with the default encoding
      final String printed = captured.toString();
      assertTrue(printed.contains("1 total matching documents"));
    } finally {
      System.setOut(originalOut);
    }
  }
}

View File

@ -0,0 +1,126 @@
package org.apache.lucene.demo.html;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Properties;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests for the demo {@link HTMLParser}: each test feeds an in-memory HTML
 * string to the parser and checks the extracted text, title, summary or
 * meta tags.
 */
public class TestHtmlParser extends LuceneTestCase {

  /** Non-ASCII body text must be returned unchanged. */
  public void testUnicode() throws Exception {
    final String html = "<html><body>汉语</body></html>";
    assertReadsTo("汉语", parserFor(html));
  }

  /** Numeric and named character entities must be decoded to their characters. */
  public void testEntities() throws Exception {
    final String html = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
    assertReadsTo("汉语¥", parserFor(html));
  }

  /** Comment markup must not appear in the extracted text. */
  public void testComments() throws Exception {
    final String html = "<html><body>foo<!-- bar --><! baz --></body></html>";
    assertReadsTo("foo", parserFor(html));
  }

  /** The contents of a script element must not appear in the extracted text. */
  public void testScript() throws Exception {
    final String html = "<html><body><script type=\"text/javascript\">" +
        "document.write(\"test\")</script>foo</body></html>";
    assertReadsTo("foo", parserFor(html));
  }

  /** The contents of a style element must not appear in the extracted text. */
  public void testStyle() throws Exception {
    final String html = "<html><head><style type=\"text/css\">" +
        "body{background-color:blue;}</style>" +
        "</head><body>foo</body></html>";
    assertReadsTo("foo", parserFor(html));
  }

  /** A DOCTYPE declaration must not appear in the extracted text. */
  public void testDoctype() throws Exception {
    final String html = "<!DOCTYPE HTML PUBLIC " +
        "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
        "\"http://www.w3.org/TR/html4/loose.dtd\">" +
        "<html><body>foo</body></html>";
    assertReadsTo("foo", parserFor(html));
  }

  /** Meta tags (both name and http-equiv forms) must be collected with lower-cased keys and values. */
  public void testMeta() throws Exception {
    final String html = "<html><head>" +
        "<meta name=\"a\" content=\"1\" />" +
        "<meta name=\"b\" content=\"2\" />" +
        "<meta name=\"keywords\" content=\"this is a test\" />" +
        "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
        "</head><body>foobar</body></html>";
    final Properties meta = parserFor(html).getMetaTags();
    assertEquals(4, meta.size());
    assertEquals("1", meta.get("a"));
    assertEquals("2", meta.get("b"));
    assertEquals("this is a test", meta.get("keywords"));
    assertEquals("text/html;charset=utf-8", meta.get("content-type"));
  }

  /** The title must be extracted even when the TITLE tag is upper case. */
  public void testTitle() throws Exception {
    final String html = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
    assertEquals("foo", parserFor(html).getTitle());
  }

  /** For a long body, the summary is expected to be exactly 200 characters long. */
  public void testSummary() throws Exception {
    final String html = "<html><head><TITLE>foo</TITLE><head><body>" +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "Summarize me. Summarize me. Summarize me. Summarize me. " +
        "</body></html>";
    assertEquals(200, parserFor(html).getSummary().length());
  }

  /** LUCENE-2246: upper-case tag and attribute names combined with Turkish characters. */
  public void testTurkish() throws Exception {
    final String html = "<html><body>" +
        "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
        "<a title=\"(ııı)\"></body></html>";
    assertReadsTo("[ş]", parserFor(html));
  }

  /** Builds a parser that reads the given markup from memory. */
  private HTMLParser parserFor(String html) {
    return new HTMLParser(new StringReader(html));
  }

  /**
   * Drains the parser's reader one character at a time and asserts that the
   * collected text equals the expected string.
   */
  private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
    final Reader in = parser.getReader();
    final StringBuilder actual = new StringBuilder();
    for (int c = in.read(); c != -1; c = in.read()) {
      actual.append((char) c);
    }
    assertEquals(expected, actual.toString());
  }
}

View File

@ -0,0 +1,8 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
</head>
<body>
汉语
</body>
</html>

View File

@ -0,0 +1 @@
contents:汉语