LUCENE-589, LUCENE-2246: fix intl bugs in contrib/demo

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031460 13f79535-47bb-0310-9956-ffa450edef68
2010-11-05 07:33:05 +00:00 · 2010-11-05 07:33:05 +00:00 · 048cdb57f4
parent 863dfccc76
commit 048cdb57f4
11 changed files with 261 additions and 44 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -134,6 +134,12 @@ Bug fixes

 * LUCENE-2616: FastVectorHighlighter: out of alignment when the first value is
  empty in multiValued field (Koji Sekiguchi)
+  
+* LUCENE-589: Fix contrib/demo for international documents. 
+  (Curtis d'Entremont via Robert Muir)
+  
+* LUCENE-2246: Fix contrib/demo for Turkish html documents.
+  (Selim Nadi via Robert Muir)  
   
 API Changes

--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/FileDocument.java
@ -18,7 +18,9 @@ package org.apache.lucene.demo;
 */

 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileReader;
+import java.io.InputStreamReader;

 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
@ -40,7 +42,7 @@ public class FileDocument {
    Reader field;
    */
  public static Document Document(File f)
-       throws java.io.FileNotFoundException {
+       throws java.io.IOException {
 	 
    // make a new, empty document
    Document doc = new Document();
@ -58,9 +60,9 @@ public class FileDocument {

    // Add the contents of the file to a field named "contents".  Specify a Reader,
    // so that the text of the file is tokenized and indexed, but not stored.
-    // Note that FileReader expects the file to be in the system's default encoding.
+    // Note that FileReader expects the file to be in UTF-8 encoding.
    // If that's not the case searching for special characters will fail.
-    doc.add(new Field("contents", new FileReader(f)));
+    doc.add(new Field("contents", new InputStreamReader(new FileInputStream(f), "UTF-8")));

    // return the document
    return doc;
--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java
@ -64,7 +64,8 @@ public class HTMLDocument {
    doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));

    FileInputStream fis = new FileInputStream(f);
-    HTMLParser parser = new HTMLParser(fis);
+    InputStreamReader reader = new InputStreamReader(fis, "UTF-8");
+    HTMLParser parser = new HTMLParser(reader);
      
    // Add the tag-stripped contents as a Reader-valued Text field so it will
    // get tokenized and indexed.
--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/SearchFiles.java
@ -19,6 +19,7 @@ package org.apache.lucene.demo;

 import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@ -124,7 +125,7 @@ public class SearchFiles {

    BufferedReader in = null;
    if (queries != null) {
-      in = new BufferedReader(new FileReader(queries));
+      in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
      in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java
@ -2,6 +2,7 @@
 package org.apache.lucene.demo.html;

 import java.io.*;
+import java.util.Locale;
 import java.util.Properties;

 public class HTMLParser implements HTMLParserConstants {
@ -40,14 +41,6 @@ public class HTMLParser implements HTMLParserConstants {
    }
  }

-  /**
-   * @deprecated Use HTMLParser(FileInputStream) instead
-   */
-  @Deprecated
-  public HTMLParser(File file) throws FileNotFoundException {
-    this(new FileInputStream(file));
-  }
-
  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
@ -231,7 +224,7 @@ InterruptedException {
  Token t1, t2;
  boolean inImg = false;
    t1 = jj_consume_token(TagName);
-   String tagName = t1.image.toLowerCase();
+   String tagName = t1.image.toLowerCase(Locale.ENGLISH);
   if(Tags.WS_ELEMS.contains(tagName) ) {
      addSpace();
    }
@ -268,7 +261,7 @@ InterruptedException {
                        )
           && t2 != null)
        {
-                currentMetaTag=t2.image.toLowerCase();
+                currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
                if(currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
                }
@ -276,7 +269,7 @@ InterruptedException {
        if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
 null)
        {
-                currentMetaContent=t2.image.toLowerCase();
+                currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
                if(currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
                }
@ -454,18 +447,18 @@ null)
    finally { jj_save(1, xla); }
  }

-  private boolean jj_3_1() {
-    if (jj_scan_token(ArgQuote1)) return true;
-    if (jj_scan_token(CloseQuote1)) return true;
-    return false;
-  }
-
  private boolean jj_3_2() {
    if (jj_scan_token(ArgQuote2)) return true;
    if (jj_scan_token(CloseQuote2)) return true;
    return false;
  }

+  private boolean jj_3_1() {
+    if (jj_scan_token(ArgQuote1)) return true;
+    if (jj_scan_token(CloseQuote1)) return true;
+    return false;
+  }
+
  /** Generated Token Manager. */
  public HTMLParserTokenManager token_source;
  SimpleCharStream jj_input_stream;
--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj
@ -19,9 +19,9 @@

 options {
  STATIC = false;
-  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
+  UNICODE_INPUT = true;
 }

 PARSER_BEGIN(HTMLParser)
@ -29,6 +29,7 @@ PARSER_BEGIN(HTMLParser)
 package org.apache.lucene.demo.html;

 import java.io.*;
+import java.util.Locale;
 import java.util.Properties;

 public class HTMLParser {
@ -67,14 +68,6 @@ public class HTMLParser {
    }
  }

-  /**
-   * @deprecated Use HTMLParser(FileInputStream) instead
-   */
-  @Deprecated
-  public HTMLParser(File file) throws FileNotFoundException {
-    this(new FileInputStream(file));
-  }
-
  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();				  // spawn parsing thread
@ -231,7 +224,7 @@ void Tag() throws IOException :
 }
 {
  t1=<TagName> {
-   String tagName = t1.image.toLowerCase();
+   String tagName = t1.image.toLowerCase(Locale.ENGLISH);
   if(Tags.WS_ELEMS.contains(tagName) ) {
      addSpace();
    }
@ -253,7 +246,7 @@ void Tag() throws IOException :
 			)
 	   && t2 != null)
 	{
-		currentMetaTag=t2.image.toLowerCase();
+		currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
 		if(currentMetaTag != null && currentMetaContent != null) {
        	addMetaTag();
 		}
@ -261,7 +254,7 @@ void Tag() throws IOException :
    	if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
 null)
 	{
-		currentMetaContent=t2.image.toLowerCase();
+		currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
 		if(currentMetaTag != null && currentMetaContent != null) {
        	addMetaTag();
 		}
--- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java
+++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParserTokenManager.java
@ -1,6 +1,7 @@
 /* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */
 package org.apache.lucene.demo.html;
 import java.io.*;
+import java.util.Locale;
 import java.util.Properties;

 /** Token Manager. */
@ -218,6 +219,9 @@ private int jjStartNfaWithStates_0(int pos, int kind, int state)
   return jjMoveNfa_0(state, pos + 1);
 }
 static final long[] jjbitVec0 = {
+   0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
+};
+static final long[] jjbitVec2 = {
   0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL
 };
 private int jjMoveNfa_0(int startState, int curPos)
@ -460,6 +464,9 @@ private int jjMoveNfa_0(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -468,7 +475,7 @@ private int jjMoveNfa_0(int startState, int curPos)
            {
               case 22:
               case 23:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 2)
                     kind = 2;
@ -476,7 +483,7 @@ private int jjMoveNfa_0(int startState, int curPos)
                  break;
               case 26:
               case 27:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 3)
                     kind = 3;
@ -562,6 +569,9 @@ private int jjMoveNfa_5(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -570,7 +580,7 @@ private int jjMoveNfa_5(int startState, int curPos)
            {
               case 1:
               case 0:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 25)
                     kind = 25;
@ -660,6 +670,9 @@ private int jjMoveNfa_7(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -667,7 +680,7 @@ private int jjMoveNfa_7(int startState, int curPos)
            switch(jjstateSet[--i])
            {
               case 0:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 29)
                     kind = 29;
@ -753,6 +766,9 @@ private int jjMoveNfa_4(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -761,7 +777,7 @@ private int jjMoveNfa_4(int startState, int curPos)
            {
               case 1:
               case 0:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 23)
                     kind = 23;
@ -876,6 +892,9 @@ private int jjMoveNfa_3(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -884,7 +903,7 @@ private int jjMoveNfa_3(int startState, int curPos)
            {
               case 0:
               case 1:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 19)
                     kind = 19;
@ -1042,6 +1061,9 @@ private int jjMoveNfa_6(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -1050,7 +1072,7 @@ private int jjMoveNfa_6(int startState, int curPos)
            {
               case 1:
               case 0:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 27)
                     kind = 27;
@ -1183,6 +1205,9 @@ private int jjMoveNfa_1(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -1191,14 +1216,14 @@ private int jjMoveNfa_1(int startState, int curPos)
            {
               case 1:
               case 0:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 14)
                     kind = 14;
                  jjCheckNAdd(0);
                  break;
               case 3:
-                  if ((jjbitVec0[i2] & l2) != 0L)
+                  if (jjCanMove_0(hiByte, i1, i2, l1, l2))
                     jjAddStates(18, 19);
                  break;
               default : break;
@ -1336,6 +1361,9 @@ private int jjMoveNfa_2(int startState, int curPos)
      }
      else
      {
+         int hiByte = (int)(curChar >> 8);
+         int i1 = hiByte >> 6;
+         long l1 = 1L << (hiByte & 077);
         int i2 = (curChar & 0xff) >> 6;
         long l2 = 1L << (curChar & 077);
         do
@ -1344,7 +1372,7 @@ private int jjMoveNfa_2(int startState, int curPos)
            {
               case 0:
               case 1:
-                  if ((jjbitVec0[i2] & l2) == 0L)
+                  if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                     break;
                  if (kind > 16)
                     kind = 16;
@ -1371,6 +1399,18 @@ static final int[] jjnextStates = {
   20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0, 
   4, 6, 3, 4, 
 };
+private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
+{
+   switch(hiByte)
+   {
+      case 0:
+         return ((jjbitVec2[i2] & l2) != 0L);
+      default :
+         if ((jjbitVec0[i1] & l1) != 0L)
+            return true;
+         return false;
+   }
+}

 /** Token literal values. */
 public static final String[] jjstrLiteralImages = {
--- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java
+++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java
@ -0,0 +1,46 @@
+package org.apache.lucene.demo;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDemo extends LuceneTestCase {
+  // LUCENE-589
+  public void testUnicodeHtml() throws Exception {
+    File dir = getDataFile("test-files/html");
+    File indexDir = new File(TEMP_DIR, "demoIndex");
+    IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() });
+    File queries = getDataFile("test-files/queries.txt");
+    PrintStream outSave = System.out;
+    try {
+      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+      PrintStream fakeSystemOut = new PrintStream(bytes);
+      System.setOut(fakeSystemOut);
+      SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()});
+      fakeSystemOut.flush();
+      String output = bytes.toString(); // intentionally use default encoding
+      assertTrue(output.contains("1 total matching documents"));
+    } finally {
+      System.setOut(outSave);
+    }
+  }
+}
--- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java
+++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java
@ -0,0 +1,126 @@
+package org.apache.lucene.demo.html;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Properties;
+
+import org.apache.lucene.demo.html.HTMLParser;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestHtmlParser extends LuceneTestCase {
+
+  public void testUnicode() throws Exception {
+    String text = "<html><body>汉语</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("汉语", parser);
+  }
+  
+  public void testEntities() throws Exception {
+    String text = "<html><body>&#x6C49;&#x8BED;&yen;</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("汉语¥", parser);
+  }
+  
+  public void testComments() throws Exception {
+    String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+  
+  public void testScript() throws Exception {
+    String text = "<html><body><script type=\"text/javascript\">" +
+                  "document.write(\"test\")</script>foo</body></html>"; 
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+  
+  public void testStyle() throws Exception {
+    String text = "<html><head><style type=\"text/css\">" +
+                  "body{background-color:blue;}</style>" +
+                  "</head><body>foo</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+  
+  public void testDoctype() throws Exception {
+    String text = "<!DOCTYPE HTML PUBLIC " + 
+    "\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
+    "\"http://www.w3.org/TR/html4/loose.dtd\">" +
+    "<html><body>foo</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("foo", parser);
+  }
+  
+  public void testMeta() throws Exception {
+    String text = "<html><head>" +
+    "<meta name=\"a\" content=\"1\" />" +
+    "<meta name=\"b\" content=\"2\" />" +
+    "<meta name=\"keywords\" content=\"this is a test\" />" +
+    "<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
+    "</head><body>foobar</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    Properties tags = parser.getMetaTags();
+    assertEquals(4, tags.size());
+    assertEquals("1", tags.get("a"));
+    assertEquals("2", tags.get("b"));
+    assertEquals("this is a test", tags.get("keywords"));
+    assertEquals("text/html;charset=utf-8", tags.get("content-type"));
+  }
+  
+  public void testTitle() throws Exception {
+    String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertEquals("foo", parser.getTitle());
+  }
+  
+  public void testSummary() throws Exception {
+    String text = "<html><head><TITLE>foo</TITLE><head><body>" + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "Summarize me. Summarize me. Summarize me. Summarize me. " + 
+    "</body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertEquals(200, parser.getSummary().length());
+  }
+  
+  // LUCENE-2246
+  public void testTurkish() throws Exception {
+    String text = "<html><body>" +
+    "<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
+    "<a title=\"(ııı)\"></body></html>";
+    HTMLParser parser = new HTMLParser(new StringReader(text));
+    assertReadsTo("[ş]", parser);
+  }
+  
+  private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
+    Reader reader = parser.getReader();
+    StringBuilder builder = new StringBuilder();
+    int ch = 0;
+    while ((ch = reader.read()) != -1) {
+      builder.append((char)ch);
+    }
+    assertEquals(expected, builder.toString());
+  }
+}
--- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html
+++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test1.html
@ -0,0 +1,8 @@
+<html>
+  <head>
+    <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
+  </head>
+  <body>
+       汉语
+  </body>
+</html>
--- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt
+++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries.txt
@ -0,0 +1 @@
+contents:汉语