SOLR-477: Added in AnalysisRequestHandler and tests/sample documents

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@629342 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-02-20 04:06:33 +00:00
parent e1f97aff7e
commit d5ef760bbc
7 changed files with 458 additions and 0 deletions

View File

@ -0,0 +1,55 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<docs>
<doc>
<field name="id">TWINX2048-3200PRO</field>
<field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
<field name="manu">Corsair Microsystems Inc.</field>
<field name="cat">electronics</field>
<field name="cat">memory</field>
<field name="features">CAS latency 2, 2-3-3-6 timing, 2.75v, unbuffered, heat-spreader</field>
<field name="price">185</field>
<field name="popularity">5</field>
<field name="inStock">true</field>
</doc>
<doc>
<field name="id">VS1GB400C3</field>
<field name="name">CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail</field>
<field name="manu">Corsair Microsystems Inc.</field>
<field name="cat">electronics</field>
<field name="cat">memory</field>
<field name="price">74.99</field>
<field name="popularity">7</field>
<field name="inStock">true</field>
</doc>
<doc>
<field name="id">VDBDB1A16</field>
<field name="name">A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM</field>
<field name="manu">A-DATA Technology Inc.</field>
<field name="cat">electronics</field>
<field name="cat">memory</field>
<field name="features">CAS latency 3, 2.7v</field>
<!-- note: price is missing on this one -->
<field name="popularity">5</field>
<field name="inStock">true</field>
</doc>
</docs>

View File

@ -0,0 +1,24 @@
#!/bin/sh
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FILES=$*
URL=http://localhost:8983/solr/analysis
for f in $FILES; do
echo Posting file $f to $URL
curl $URL --data-binary @$f -H 'Content-type:text/xml; charset=utf-8'
echo
done

View File

@ -0,0 +1,24 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<docs>
<doc>
<field name="id">TWINX2048-3200PRO</field>
<field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
</doc>
</docs>

View File

@ -505,6 +505,16 @@
-->
</requestHandler>
<!--
Analysis request handler. Since Solr 1.3. Use to returnhow a document is analyzed. Useful
for debugging and as a token server for other types of applications
-->
<requestHandler name="/analysis" class="solr.AnalysisRequestHandler" >
<!--
<str name="update.processor.class">org.apache.solr.handler.UpdateRequestProcessor</str>
-->
</requestHandler>
<!-- CSV update handler, loaded on demand -->
<requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />

View File

@ -82,6 +82,10 @@ public class XML {
escape(str, out, attribute_escapes);
}
public static void escapeAttributeValue(char [] chars, int start, int length, Writer out) throws IOException {
escape(chars, start, length, out, attribute_escapes);
}
public final static void writeXML(Writer out, String tag, String val) throws IOException {
out.write('<');
@ -149,6 +153,19 @@ public class XML {
}
}
private static void escape(char [] chars, int offset, int length, Writer out, String [] escapes) throws IOException{
for (int i=offset; i<length; i++) {
char ch = chars[i];
if (ch<escapes.length) {
String replacement = escapes[ch];
if (replacement != null) {
out.write(replacement);
continue;
}
}
out.write(ch);
}
}
private static void escape(String str, Writer out, String[] escapes) throws IOException {
for (int i=0; i<str.length(); i++) {

View File

@ -0,0 +1,223 @@
package org.apache.solr.handler;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import javanet.staxutils.BaseXMLInputFactory;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Logger;
/**
*
*
**/
public class AnalysisRequestHandler extends RequestHandlerBase {
public static Logger log = Logger.getLogger(AnalysisRequestHandler.class.getName());
private XMLInputFactory inputFactory;
@Override
public void init(NamedList args) {
super.init(args);
inputFactory = BaseXMLInputFactory.newInstance();
try {
// The java 1.6 bundled stax parser (sjsxp) does not currently have a thread-safe
// XMLInputFactory, as that implementation tries to cache and reuse the
// XMLStreamReader. Setting the parser-specific "reuse-instance" property to false
// prevents this.
// All other known open-source stax parsers (and the bea ref impl)
// have thread-safe factories.
inputFactory.setProperty("reuse-instance", Boolean.FALSE);
}
catch (IllegalArgumentException ex) {
// Other implementations will likely throw this exception since "reuse-instance"
// isimplementation specific.
log.fine("Unable to set the 'reuse-instance' property for the input factory: " + inputFactory);
}
}
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
RequestHandlerUtils.addExperimentalFormatWarning(rsp);
SolrParams params = req.getParams();
Iterable<ContentStream> streams = req.getContentStreams();
if (streams != null) {
for (ContentStream stream : req.getContentStreams()) {
Reader reader = stream.getReader();
try {
XMLStreamReader parser = inputFactory.createXMLStreamReader(reader);
NamedList<Object> result = processContent(parser, req.getSchema());
rsp.add("response", result);
}
finally {
IOUtils.closeQuietly(reader);
}
}
}
}
NamedList<Object> processContent(XMLStreamReader parser,
IndexSchema schema) throws XMLStreamException, IOException {
NamedList<Object> result = new NamedList<Object>();
while (true) {
int event = parser.next();
switch (event) {
case XMLStreamConstants.END_DOCUMENT: {
parser.close();
return result;
}
case XMLStreamConstants.START_ELEMENT: {
String currTag = parser.getLocalName();
if ("doc".equals(currTag)) {
log.finest("Tokenizing doc...");
SolrInputDocument doc = readDoc(parser);
SchemaField uniq = schema.getUniqueKeyField();
NamedList<NamedList<NamedList<Object>>> theTokens = new NamedList<NamedList<NamedList<Object>>>();
result.add(doc.getFieldValue(uniq.getName()).toString(), theTokens);
for (String name : doc.getFieldNames()) {
FieldType ft = schema.getFieldType(name);
Analyzer analyzer = ft.getAnalyzer();
Collection<Object> vals = doc.getFieldValues(name);
for (Object val : vals) {
Reader reader = new StringReader(val.toString());
TokenStream tstream = analyzer.tokenStream(name, reader);
NamedList<NamedList<Object>> tokens = getTokens(tstream);
theTokens.add(name, tokens);
}
}
}
break;
}
}
}
}
static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
Token t = null;
while (((t = tstream.next()) != null)) {
NamedList<Object> token = new NamedList<Object>();
tokens.add("token", token);
token.add("value", new String(t.termBuffer(), 0, t.termLength()));
token.add("start", t.startOffset());
token.add("end", t.endOffset());
token.add("posInc", t.getPositionIncrement());
token.add("type", t.type());
//TODO: handle payloads
}
return tokens;
}
SolrInputDocument readDoc(XMLStreamReader parser) throws XMLStreamException {
SolrInputDocument doc = new SolrInputDocument();
StringBuilder text = new StringBuilder();
String name = null;
String attrName = "";
float boost = 1.0f;
boolean isNull = false;
while (true) {
int event = parser.next();
switch (event) {
// Add everything to the text
case XMLStreamConstants.SPACE:
case XMLStreamConstants.CDATA:
case XMLStreamConstants.CHARACTERS:
text.append(parser.getText());
break;
case XMLStreamConstants.END_ELEMENT:
if ("doc".equals(parser.getLocalName())) {
return doc;
} else if ("field".equals(parser.getLocalName())) {
if (!isNull) {
doc.addField(name, text.toString(), boost);
boost = 1.0f;
}
}
break;
case XMLStreamConstants.START_ELEMENT:
text.setLength(0);
String localName = parser.getLocalName();
if (!"field".equals(localName)) {
log.warning("unexpected XML tag doc/" + localName);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"unexpected XML tag doc/" + localName);
}
String attrVal = "";
for (int i = 0; i < parser.getAttributeCount(); i++) {
attrName = parser.getAttributeLocalName(i);
attrVal = parser.getAttributeValue(i);
if ("name".equals(attrName)) {
name = attrVal;
}
}
break;
}
}
}
//////////////////////// SolrInfoMBeans methods //////////////////////
@Override
public String getDescription() {
return "Provide Analysis of text";
}
@Override
public String getVersion() {
return "$Revision:$";
}
@Override
public String getSourceId() {
return "$Id:$";
}
@Override
public String getSource() {
return "$URL:$";
}
}

View File

@ -0,0 +1,105 @@
package org.apache.solr.handler;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import javanet.staxutils.BaseXMLInputFactory;
import org.apache.lucene.analysis.Token;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.AbstractSolrTestCase;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamReader;
import java.io.StringReader;
import java.util.List;
public class AnalysisRequestHandlerTest extends AbstractSolrTestCase {
private XMLInputFactory inputFactory = BaseXMLInputFactory.newInstance();
@Override
public String getSchemaFile() {
return "schema.xml";
}
@Override
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testReadDoc() throws Exception {
String xml =
"<docs><doc >" +
" <field name=\"id\" >12345</field>" +
" <field name=\"name\">cute little kitten</field>" +
" <field name=\"text\">the quick red fox jumped over the lazy brown dogs</field>" +
"</doc>" +
"<doc >" +
" <field name=\"id\" >12346</field>" +
" <field name=\"name\">big mean dog</field>" +
" <field name=\"text\">cats like to purr</field>" +
"</doc>" +
"</docs>";
XMLStreamReader parser =
inputFactory.createXMLStreamReader(new StringReader(xml));
AnalysisRequestHandler handler = new AnalysisRequestHandler();
NamedList<Object> result = handler.processContent(parser, h.getCore().getSchema());
assertTrue("result is null and it shouldn't be", result != null);
NamedList<NamedList<NamedList<Object>>> theTokens = (NamedList<NamedList<NamedList<Object>>>) result.get("12345");
assertTrue("theTokens is null and it shouldn't be", theTokens != null);
NamedList<NamedList<Object>> tokens = theTokens.get("name");
assertTrue("tokens is null and it shouldn't be", tokens != null);
assertTrue("tokens Size: " + tokens.size() + " is not : " + 3, tokens.size() == 3);
NamedList<Object> token;
String value;
token = tokens.get("token", 0);
value = (String) token.get("value");
assertTrue(value + " is not equal to " + "cute", value.equals("cute") == true);
token = tokens.get("token", 1);
value = (String) token.get("value");
assertTrue(value + " is not equal to " + "little", value.equals("little") == true);
token = tokens.get("token", 2);
value = (String) token.get("value");
assertTrue(value + " is not equal to " + "kitten", value.equals("kitten") == true);
tokens = theTokens.get("text");
assertTrue("tokens is null and it shouldn't be", tokens != null);
assertTrue("tokens Size: " + tokens.size() + " is not : " + 8, tokens.size() == 8);//stopwords are removed
String[] gold = new String[]{"quick", "red", "fox", "jump", "over", "lazi", "brown", "dog"};
for (int j = 0; j < gold.length; j++) {
NamedList<Object> tok = tokens.get("token", j);
value = (String) tok.get("value");
assertTrue(value + " is not equal to " + gold[j], value.equals(gold[j]) == true);
}
theTokens = (NamedList<NamedList<NamedList<Object>>>) result.get("12346");
assertTrue("theTokens is null and it shouldn't be", theTokens != null);
tokens = theTokens.get("name");
assertTrue("tokens is null and it shouldn't be", tokens != null);
assertTrue("tokens Size: " + tokens.size() + " is not : " + 3, tokens.size() == 3);
gold = new String[]{"cat", "like", "purr"};
tokens = theTokens.get("text");
assertTrue("tokens is null and it shouldn't be", tokens != null);
assertTrue("tokens Size: " + tokens.size() + " is not : " + 3, tokens.size() == 3);//stopwords are removed
for (int j = 0; j < gold.length; j++) {
NamedList<Object> tok = tokens.get("token", j);
value = (String) tok.get("value");
assertTrue(value + " is not equal to " + gold[j], value.equals(gold[j]) == true);
}
}
}