SOLR-5147 Support child documents in DIH

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1652360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Noble Paul 2015-01-16 08:33:35 +00:00
parent d5c52bc915
commit e32c7b7db9
6 changed files with 421 additions and 3 deletions

View File

@ -307,6 +307,8 @@ New Features
for Collapse and Expand (Joel Bernstein)
* SOLR-6937: In schemaless mode ,replace spaces and special characters with underscore (Noble Paul)
* SOLR-5147: Support child documents in DIH (Shawn Heisey, Thomas Champagne, Mikhail Khludnev via Noble Paul
Bug Fixes
----------------------

View File

@ -492,15 +492,29 @@ public class DocBuilder {
getDebugLogger().log(DIHLogLevels.ENTITY_OUT, epw.getEntity().getName(), arow);
}
importStatistics.rowsCount.incrementAndGet();
DocWrapper childDoc = null;
if (doc != null) {
handleSpecialCommands(arow, doc);
addFields(epw.getEntity(), doc, arow, vr);
if (epw.getEntity().isChild()) {
childDoc = new DocWrapper();
handleSpecialCommands(arow, childDoc);
addFields(epw.getEntity(), childDoc, arow, vr);
doc.addChildDocument(childDoc);
} else {
handleSpecialCommands(arow, doc);
addFields(epw.getEntity(), doc, arow, vr);
}
}
if (epw.getEntity().getChildren() != null) {
vr.addNamespace(epw.getEntity().getName(), arow);
for (EntityProcessorWrapper child : epw.getChildren()) {
buildDocument(vr, doc,
if (childDoc != null) {
buildDocument(vr, childDoc,
child.getEntity().isDocRoot() ? pk : null, child, false, ctx, entitiesToDestroy);
} else {
buildDocument(vr, doc,
child.getEntity().isDocRoot() ? pk : null, child, false, ctx, entitiesToDestroy);
}
}
vr.removeNamespace(epw.getEntity().getName());
}

View File

@ -40,6 +40,8 @@ public class ConfigNameConstants {
public static final String IMPORTER_NS_SHORT = "dih";
public static final String ROOT_ENTITY = "rootEntity";
public static final String CHILD = "child";
public static final String FUNCTION = "function";

View File

@ -41,6 +41,7 @@ public class Entity {
private final String processorName;
private final Entity parentEntity;
private final boolean docRoot;
private final boolean child;
private final List<Entity> children;
private final List<EntityField> fields;
private final Map<String,Set<EntityField>> colNameVsField;
@ -77,6 +78,9 @@ public class Entity {
docRoot = false;
}
String childValue = ConfigParseUtil.getStringAttribute(element, ConfigNameConstants.CHILD, null);
child = "true".equals(childValue);
Map<String,String> modAttributes = ConfigParseUtil
.getAllAttributes(element);
modAttributes.put(ConfigNameConstants.DATA_SRC, this.dataSourceName);
@ -219,4 +223,8 @@ public class Entity {
public List<Map<String,String>> getAllFieldsList() {
return allFieldAttributes;
}
public boolean isChild() {
return child;
}
}

View File

@ -43,6 +43,7 @@
<field name="DO_NOT_INDEX" type="ignored" />
<field name="_version_" type="tlong" indexed="true" stored="true" multiValued="false"/>
<field name="_root_" type="string" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="*_i" type="tint" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>

View File

@ -0,0 +1,391 @@
package org.apache.solr.handler.dataimport;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.solr.handler.dataimport.config.ConfigNameConstants;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.SolrIndexSearcher;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* Test for DocBuilder using the test harness.
* <b> Documents are hierarchical in this test, i.e. each document have nested children documents.</b>
*/
public class TestHierarchicalDocBuilder extends AbstractDataImportHandlerTestCase {
private static final String FIELD_ID = "id";
private int id = 0; //unique id
private SolrQueryRequest req;
private static class ContextHolder {
int counter = 0;
List<Hierarchy> hierarchies = new ArrayList<Hierarchy>();
}
private static class Hierarchy {
String elementType;
Map<String, Object> elementData = new HashMap<String,Object>();
List<Hierarchy> elements = new ArrayList<Hierarchy>();
}
@BeforeClass
public static void beforeClass() throws Exception {
initCore("dataimport-solrconfig.xml", "dataimport-schema.xml");
}
@Before
public void before() {
req = req("*:*"); // don't really care about query
MockDataSource.clearCache();
}
@After
public void after() {
req.close();
MockDataSource.clearCache();
}
@Test
public void testThreeLevelHierarchy() throws Exception {
int parentsNum = 3; //fixed for simplicity of test
int childrenNum = 0;
int grandChildrenNum = 0;
final String parentType = "parent";
final String childType = "child";
final String grandChildType = "grand_child";
List<String> parentIds = createDataIterator("select * from PARENT", parentType, parentType, parentsNum);
Collections.shuffle(parentIds);
String parentId1 = parentIds.get(0);
String parentId2 = parentIds.get(1);
//parent 1 children
int firstParentChildrenNum = 3; //fixed for simplicity of test
String select = "select * from CHILD where parent_id='" + parentId1 + "'";
List<String> childrenIds = createDataIterator(select, childType, "child of first parent", firstParentChildrenNum);
List<String> firstParentChildrenIds = new ArrayList<String>(childrenIds);
childrenNum += childrenIds.size();
// grand children of first parent first child
String childId = childrenIds.get(0);
String description = "grandchild of first parent, child of " + childId + " child";
select = "select * from GRANDCHILD where parent_id='" + childId + "'";
List<String> grandChildrenIds = createDataIterator(select, grandChildType, description, atLeast(2));
grandChildrenNum += grandChildrenIds.size();
// grand children of first parent second child
childId = childrenIds.get(1);
description = "grandchild of first parent, child of " + childId + " child";
select = "select * from GRANDCHILD where parent_id='" + childId + "'";
List<String> grandChildrenIds2 = createDataIterator(select, grandChildType, description, atLeast(2));
grandChildrenNum += grandChildrenIds2.size();
grandChildrenIds.addAll(grandChildrenIds2);
// third children of first parent has no grand children
// parent 2 children (no grand children)
select = "select * from CHILD where parent_id='" + parentId2 + "'";
childrenIds = createDataIterator(select, childType, "child of second parent", atLeast(2));
childrenNum += childrenIds.size();
// parent 3 has no children and grand children
int totalDocsNum = parentsNum + childrenNum + grandChildrenNum;
runFullImport(threeLevelHierarchyConfig);
assertTrue("Update request processor processAdd was not called", TestUpdateRequestProcessor.processAddCalled);
assertTrue("Update request processor processCommit was not callled", TestUpdateRequestProcessor.processCommitCalled);
assertTrue("Update request processor finish was not called", TestUpdateRequestProcessor.finishCalled);
// very simple asserts to check that we at least have correct num of docs indexed
assertQ(req("*:*"), "//*[@numFound='" + totalDocsNum + "']");
assertQ(req("type_s:parent"), "//*[@numFound='" + parentsNum + "']");
assertQ(req("type_s:child"), "//*[@numFound='" + childrenNum + "']");
assertQ(req("type_s:grand_child"), "//*[@numFound='" + grandChildrenNum + "']");
// let's check BlockJoin
// get first parent by any grand children
String randomGrandChildId = grandChildrenIds.get(random().nextInt(grandChildrenIds.size()));
Query query = createToParentQuery(parentType, FIELD_ID, randomGrandChildId);
assertSearch(query, FIELD_ID, parentId1);
// get first parent by any children
String randomChildId = firstParentChildrenIds.get(random().nextInt(firstParentChildrenIds.size()));
query = createToParentQuery(parentType, FIELD_ID, randomChildId);
assertSearch(query, FIELD_ID, parentId1);
// get parent by children by grand children
randomGrandChildId = grandChildrenIds.get(random().nextInt(grandChildrenIds.size()));
ToParentBlockJoinQuery childBlockJoinQuery = createToParentQuery(childType, FIELD_ID, randomGrandChildId);
ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(childBlockJoinQuery, createParentFilter(parentType), ScoreMode.Avg);
assertSearch(blockJoinQuery, FIELD_ID, parentId1);
}
@Test
public void testRandomDepthHierarchy() throws Exception {
final String parentType = "parent";
int parentsNum = atLeast(5);
int depth = atLeast(3);
ContextHolder holder = new ContextHolder();
String config = createRandomizedConfig(depth, parentType, parentsNum, holder);
runFullImport(config);
assertTrue("Update request processor processAdd was not called", TestUpdateRequestProcessor.processAddCalled);
assertTrue("Update request processor processCommit was not callled", TestUpdateRequestProcessor.processCommitCalled);
assertTrue("Update request processor finish was not called", TestUpdateRequestProcessor.finishCalled);
assertQ(req("type_s:" + parentType), "//*[@numFound='" + parentsNum + "']");
assertQ(req("-type_s:"+ parentType), "//*[@numFound='" + (holder.counter - parentsNum) + "']");
// let's check BlockJoin
Hierarchy randomHierarchy = holder.hierarchies.get(random().nextInt(holder.hierarchies.size()));
Query deepestQuery = createBlockJoinQuery(randomHierarchy);
assertSearch(deepestQuery, FIELD_ID, (String) randomHierarchy.elementData.get(FIELD_ID));
}
private Query createBlockJoinQuery(Hierarchy hierarchy) {
List<Hierarchy> elements = hierarchy.elements;
if (elements.isEmpty()) {
BooleanQuery childQuery = new BooleanQuery();
childQuery.add(new TermQuery(new Term(FIELD_ID, (String) hierarchy.elementData.get(FIELD_ID))), Occur.MUST);
return childQuery;
}
Query childQuery = createBlockJoinQuery(elements.get(random().nextInt(elements.size())));
return createToParentQuery(hierarchy.elementType, childQuery);
}
private ToParentBlockJoinQuery createToParentQuery(String parentType, String childField, String childFieldValue) {
BooleanQuery childQuery = new BooleanQuery();
childQuery.add(new TermQuery(new Term(childField, childFieldValue)), Occur.MUST);
ToParentBlockJoinQuery result = createToParentQuery(parentType, childQuery);
return result;
}
private ToParentBlockJoinQuery createToParentQuery(String parentType, Query childQuery) {
ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(childQuery, createParentFilter(parentType), ScoreMode.Avg);
return blockJoinQuery;
}
private void assertSearch(Query query, String field, String... values) throws IOException {
/* The limit of search queue is doubled to catch the error in case when for some reason there are more docs than expected */
SolrIndexSearcher searcher = req.getSearcher();
TopDocs result = searcher.search(query, values.length * 2);
assertEquals(values.length, result.totalHits);
List<String> actualValues = new ArrayList<String>();
for (int index = 0; index < values.length; ++index) {
StoredDocument doc = searcher.doc(result.scoreDocs[index].doc);
actualValues.add(doc.get(field));
}
for (String expectedValue: values) {
boolean removed = actualValues.remove(expectedValue);
if (!removed) {
fail("Search result does not contain expected values");
}
}
}
@SuppressWarnings("unchecked")
private List<String> createDataIterator(String query, String type, String description, int count) {
List<Map<String, Object>> data = new ArrayList<Map<String, Object>>();
List<String> ids = new ArrayList<String>(count);
for (int index = 0; index < count; ++index) {
String docId = nextId();
ids.add(docId);
Map<String, Object> doc = createMap(FIELD_ID, docId, "desc", docId + " " + description, "type_s", type);
data.add(doc);
}
Collections.shuffle(data, random());
MockDataSource.setIterator(query, data.iterator());
return ids;
}
/** Internally configures MockDataSource **/
private String createRandomizedConfig(int depth, String parentType, int parentsNum, ContextHolder holder) {
List<Hierarchy> parentData = createMockedIterator(parentType, "SELECT * FROM " + parentType, parentsNum, holder);
// each map represents parent and each parent is root of separate hierarchy
holder.hierarchies = parentData;
String children = createChildren(parentType, 0, depth, parentData, holder);
String rootFields = createFieldsList(FIELD_ID, "desc", "type_s");
String rootEntity = MessageFormat.format(rootEntityTemplate, parentType, "SELECT * FROM " + parentType, rootFields, children);
String config = MessageFormat.format(dataConfigTemplate, rootEntity);
return config;
}
@SuppressWarnings("unchecked")
private List<Hierarchy> createMockedIterator(String type, String query, int amount, ContextHolder holder) {
List<Hierarchy> hierarchies = new ArrayList<Hierarchy>();
List<Map<String, Object>> data = new ArrayList<Map<String, Object>>();
for (int index = 0; index < amount; ++index) {
holder.counter++;
String idStr = String.valueOf(holder.counter);
Map<String, Object> element = createMap(FIELD_ID, idStr, "desc", type + "_" + holder.counter, "type_s", type);
data.add(element);
Hierarchy hierarchy = new Hierarchy();
hierarchy.elementType = type;
hierarchy.elementData = element;
hierarchies.add(hierarchy);
}
MockDataSource.setIterator(query, data.iterator());
return hierarchies;
}
private List<Hierarchy> createMockedIterator(String type, List<Hierarchy> parentData, ContextHolder holder) {
List<Hierarchy> result = new ArrayList<Hierarchy>();
for (Hierarchy parentHierarchy: parentData) {
Map<String, Object> data = parentHierarchy.elementData;
String id = (String) data.get(FIELD_ID);
String select = String.format(Locale.ROOT, "select * from %s where %s='%s'", type, type + "_parent_id", id);
List<Hierarchy> childHierarchies = createMockedIterator(type, select, atLeast(5), holder);
parentHierarchy.elements.addAll(childHierarchies);
result.addAll(childHierarchies);
}
return result;
}
private String createChildren(String parentName, int currentLevel, int maxLevel,
List<Hierarchy> parentData, ContextHolder holder) {
if (currentLevel == maxLevel) { //recursion base
return "";
}
int childrenNumber = atLeast(2);
StringBuilder builder = new StringBuilder();
for (int childIndex = 0; childIndex < childrenNumber; ++childIndex) {
String childName = parentName + "Child" + childIndex;
String fields = createFieldsList(FIELD_ID, "desc", "type_s");
String select = String.format(Locale.ROOT, "select * from %s where %s='%s'", childName, childName + "_parent_id", "${" + parentName + ".id}");
//for each child entity create several iterators
List<Hierarchy> childData = createMockedIterator(childName, parentData, holder);
String subChildren = createChildren(childName, currentLevel + 1, maxLevel, childData, holder);
String child = MessageFormat.format(childEntityTemplate, childName, select, fields, subChildren);
builder.append(child);
builder.append('\n');
}
return builder.toString();
}
private String createFieldsList(String... fields) {
StringBuilder builder = new StringBuilder();
for (String field: fields) {
String text = String.format(Locale.ROOT, "<field column='%s' />", field);
builder.append(text);
builder.append('\n');
}
return builder.toString();
}
private final String threeLevelHierarchyConfig = "<dataConfig>\n" +
" <dataSource type='MockDataSource' />\n" +
" <document>\n" +
" <entity name='PARENT' query='select * from PARENT'>\n" +
" <field column='id' />\n" +
" <field column='desc' />\n" +
" <field column='type_s' />\n" +
" <entity child='true' name='CHILD' query=\"select * from CHILD where parent_id='${PARENT.id}'\">\n" +
" <field column='id' />\n" +
" <field column='desc' />\n" +
" <field column='type_s' />\n" +
" <entity child='true' name='GRANDCHILD' query=\"select * from GRANDCHILD where parent_id='${CHILD.id}'\">\n" +
" <field column='id' />\n" +
" <field column='desc' />\n" +
" <field column='type_s' />\n" +
" </entity>\n" +
" </entity>\n" +
" </entity>\n" +
" </document>\n" +
"</dataConfig>";
/** {0} is rootEntity block **/
private final String dataConfigTemplate = "<dataConfig><dataSource type=\"MockDataSource\" />\n<document>\n {0}</document></dataConfig>";
/**
* {0} - entityName,
* {1} - select query
* {2} - fieldsList
* {3} - childEntitiesList
**/
private final String rootEntityTemplate = "<entity name=\"{0}\" query=\"{1}\">\n{2} {3}\n</entity>\n";
/**
* {0} - entityName,
* {1} - select query
* {2} - fieldsList
* {3} - childEntitiesList
**/
private final String childEntityTemplate = "<entity " + ConfigNameConstants.CHILD + "=\"true\" name=\"{0}\" query=\"{1}\">\n {2} {3} </entity>\n";
private BitDocIdSetFilter createParentFilter(String type) {
BooleanQuery parentQuery = new BooleanQuery();
parentQuery.add(new TermQuery(new Term("type_s", type)), Occur.MUST);
return new BitDocIdSetCachingWrapperFilter(new QueryWrapperFilter(parentQuery));
}
private String nextId() {
++id;
return String.valueOf(id);
}
}