mirror of https://github.com/apache/lucene.git
SOLR-5147 Support child documents in DIH
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1652360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d5c52bc915
commit
e32c7b7db9
|
@ -307,6 +307,8 @@ New Features
|
|||
for Collapse and Expand (Joel Bernstein)
|
||||
|
||||
* SOLR-6937: In schemaless mode, replace spaces and special characters with underscore (Noble Paul)
|
||||
|
||||
* SOLR-5147: Support child documents in DIH (Shawn Heisey, Thomas Champagne, Mikhail Khludnev via Noble Paul)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
|
|
@ -492,15 +492,29 @@ public class DocBuilder {
|
|||
getDebugLogger().log(DIHLogLevels.ENTITY_OUT, epw.getEntity().getName(), arow);
|
||||
}
|
||||
importStatistics.rowsCount.incrementAndGet();
|
||||
|
||||
DocWrapper childDoc = null;
|
||||
if (doc != null) {
|
||||
handleSpecialCommands(arow, doc);
|
||||
addFields(epw.getEntity(), doc, arow, vr);
|
||||
if (epw.getEntity().isChild()) {
|
||||
childDoc = new DocWrapper();
|
||||
handleSpecialCommands(arow, childDoc);
|
||||
addFields(epw.getEntity(), childDoc, arow, vr);
|
||||
doc.addChildDocument(childDoc);
|
||||
} else {
|
||||
handleSpecialCommands(arow, doc);
|
||||
addFields(epw.getEntity(), doc, arow, vr);
|
||||
}
|
||||
}
|
||||
if (epw.getEntity().getChildren() != null) {
|
||||
vr.addNamespace(epw.getEntity().getName(), arow);
|
||||
for (EntityProcessorWrapper child : epw.getChildren()) {
|
||||
buildDocument(vr, doc,
|
||||
if (childDoc != null) {
|
||||
buildDocument(vr, childDoc,
|
||||
child.getEntity().isDocRoot() ? pk : null, child, false, ctx, entitiesToDestroy);
|
||||
} else {
|
||||
buildDocument(vr, doc,
|
||||
child.getEntity().isDocRoot() ? pk : null, child, false, ctx, entitiesToDestroy);
|
||||
}
|
||||
}
|
||||
vr.removeNamespace(epw.getEntity().getName());
|
||||
}
|
||||
|
|
|
@ -40,6 +40,8 @@ public class ConfigNameConstants {
|
|||
public static final String IMPORTER_NS_SHORT = "dih";
|
||||
|
||||
public static final String ROOT_ENTITY = "rootEntity";
|
||||
|
||||
public static final String CHILD = "child";
|
||||
|
||||
public static final String FUNCTION = "function";
|
||||
|
||||
|
|
|
@ -41,6 +41,7 @@ public class Entity {
|
|||
private final String processorName;
|
||||
private final Entity parentEntity;
|
||||
private final boolean docRoot;
|
||||
private final boolean child;
|
||||
private final List<Entity> children;
|
||||
private final List<EntityField> fields;
|
||||
private final Map<String,Set<EntityField>> colNameVsField;
|
||||
|
@ -77,6 +78,9 @@ public class Entity {
|
|||
docRoot = false;
|
||||
}
|
||||
|
||||
String childValue = ConfigParseUtil.getStringAttribute(element, ConfigNameConstants.CHILD, null);
|
||||
child = "true".equals(childValue);
|
||||
|
||||
Map<String,String> modAttributes = ConfigParseUtil
|
||||
.getAllAttributes(element);
|
||||
modAttributes.put(ConfigNameConstants.DATA_SRC, this.dataSourceName);
|
||||
|
@ -219,4 +223,8 @@ public class Entity {
|
|||
public List<Map<String,String>> getAllFieldsList() {
|
||||
return allFieldAttributes;
|
||||
}
|
||||
|
||||
public boolean isChild() {
|
||||
return child;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
<field name="DO_NOT_INDEX" type="ignored" />
|
||||
|
||||
<field name="_version_" type="tlong" indexed="true" stored="true" multiValued="false"/>
|
||||
<field name="_root_" type="string" indexed="true" stored="true" multiValued="false"/>
|
||||
|
||||
<dynamicField name="*_i" type="tint" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
|
||||
|
|
|
@ -0,0 +1,391 @@
|
|||
package org.apache.solr.handler.dataimport;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.MessageFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.QueryWrapperFilter;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
|
||||
import org.apache.lucene.search.join.BitDocIdSetFilter;
|
||||
import org.apache.lucene.search.join.ScoreMode;
|
||||
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
|
||||
import org.apache.solr.handler.dataimport.config.ConfigNameConstants;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Test for DocBuilder using the test harness.
|
||||
* <b>Documents are hierarchical in this test, i.e. each document has nested child documents.</b>
|
||||
*/
|
||||
public class TestHierarchicalDocBuilder extends AbstractDataImportHandlerTestCase {
|
||||
|
||||
private static final String FIELD_ID = "id";
|
||||
private int id = 0; //unique id
|
||||
private SolrQueryRequest req;
|
||||
|
||||
private static class ContextHolder {
|
||||
int counter = 0;
|
||||
List<Hierarchy> hierarchies = new ArrayList<Hierarchy>();
|
||||
}
|
||||
|
||||
private static class Hierarchy {
|
||||
String elementType;
|
||||
Map<String, Object> elementData = new HashMap<String,Object>();
|
||||
List<Hierarchy> elements = new ArrayList<Hierarchy>();
|
||||
}
|
||||
|
||||
/**
 * Runs once before the tests in this class and initializes the core from
 * dataimport-solrconfig.xml and dataimport-schema.xml.
 */
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("dataimport-solrconfig.xml", "dataimport-schema.xml");
|
||||
}
|
||||
|
||||
/**
 * Runs before each test: creates the request that assertSearch later takes its searcher from
 * and clears the MockDataSource cache.
 */
@Before
|
||||
public void before() {
|
||||
req = req("*:*"); // don't really care about query
|
||||
MockDataSource.clearCache();
|
||||
}
|
||||
|
||||
/**
 * Runs after each test: closes the request created in before() and clears the MockDataSource
 * cache again.
 */
@After
|
||||
public void after() {
|
||||
req.close();
|
||||
MockDataSource.clearCache();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testThreeLevelHierarchy() throws Exception {
|
||||
int parentsNum = 3; //fixed for simplicity of test
|
||||
int childrenNum = 0;
|
||||
int grandChildrenNum = 0;
|
||||
|
||||
final String parentType = "parent";
|
||||
final String childType = "child";
|
||||
final String grandChildType = "grand_child";
|
||||
|
||||
List<String> parentIds = createDataIterator("select * from PARENT", parentType, parentType, parentsNum);
|
||||
Collections.shuffle(parentIds);
|
||||
String parentId1 = parentIds.get(0);
|
||||
String parentId2 = parentIds.get(1);
|
||||
|
||||
//parent 1 children
|
||||
int firstParentChildrenNum = 3; //fixed for simplicity of test
|
||||
String select = "select * from CHILD where parent_id='" + parentId1 + "'";
|
||||
List<String> childrenIds = createDataIterator(select, childType, "child of first parent", firstParentChildrenNum);
|
||||
List<String> firstParentChildrenIds = new ArrayList<String>(childrenIds);
|
||||
childrenNum += childrenIds.size();
|
||||
|
||||
// grand children of first parent first child
|
||||
String childId = childrenIds.get(0);
|
||||
String description = "grandchild of first parent, child of " + childId + " child";
|
||||
select = "select * from GRANDCHILD where parent_id='" + childId + "'";
|
||||
List<String> grandChildrenIds = createDataIterator(select, grandChildType, description, atLeast(2));
|
||||
grandChildrenNum += grandChildrenIds.size();
|
||||
|
||||
// grand children of first parent second child
|
||||
childId = childrenIds.get(1);
|
||||
description = "grandchild of first parent, child of " + childId + " child";
|
||||
select = "select * from GRANDCHILD where parent_id='" + childId + "'";
|
||||
List<String> grandChildrenIds2 = createDataIterator(select, grandChildType, description, atLeast(2));
|
||||
grandChildrenNum += grandChildrenIds2.size();
|
||||
|
||||
grandChildrenIds.addAll(grandChildrenIds2);
|
||||
|
||||
// third children of first parent has no grand children
|
||||
|
||||
// parent 2 children (no grand children)
|
||||
select = "select * from CHILD where parent_id='" + parentId2 + "'";
|
||||
childrenIds = createDataIterator(select, childType, "child of second parent", atLeast(2));
|
||||
childrenNum += childrenIds.size();
|
||||
|
||||
// parent 3 has no children and grand children
|
||||
|
||||
int totalDocsNum = parentsNum + childrenNum + grandChildrenNum;
|
||||
|
||||
runFullImport(threeLevelHierarchyConfig);
|
||||
|
||||
assertTrue("Update request processor processAdd was not called", TestUpdateRequestProcessor.processAddCalled);
|
||||
assertTrue("Update request processor processCommit was not callled", TestUpdateRequestProcessor.processCommitCalled);
|
||||
assertTrue("Update request processor finish was not called", TestUpdateRequestProcessor.finishCalled);
|
||||
|
||||
// very simple asserts to check that we at least have correct num of docs indexed
|
||||
assertQ(req("*:*"), "//*[@numFound='" + totalDocsNum + "']");
|
||||
assertQ(req("type_s:parent"), "//*[@numFound='" + parentsNum + "']");
|
||||
assertQ(req("type_s:child"), "//*[@numFound='" + childrenNum + "']");
|
||||
assertQ(req("type_s:grand_child"), "//*[@numFound='" + grandChildrenNum + "']");
|
||||
|
||||
// let's check BlockJoin
|
||||
// get first parent by any grand children
|
||||
String randomGrandChildId = grandChildrenIds.get(random().nextInt(grandChildrenIds.size()));
|
||||
Query query = createToParentQuery(parentType, FIELD_ID, randomGrandChildId);
|
||||
assertSearch(query, FIELD_ID, parentId1);
|
||||
|
||||
// get first parent by any children
|
||||
String randomChildId = firstParentChildrenIds.get(random().nextInt(firstParentChildrenIds.size()));
|
||||
query = createToParentQuery(parentType, FIELD_ID, randomChildId);
|
||||
assertSearch(query, FIELD_ID, parentId1);
|
||||
|
||||
// get parent by children by grand children
|
||||
randomGrandChildId = grandChildrenIds.get(random().nextInt(grandChildrenIds.size()));
|
||||
ToParentBlockJoinQuery childBlockJoinQuery = createToParentQuery(childType, FIELD_ID, randomGrandChildId);
|
||||
ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(childBlockJoinQuery, createParentFilter(parentType), ScoreMode.Avg);
|
||||
assertSearch(blockJoinQuery, FIELD_ID, parentId1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRandomDepthHierarchy() throws Exception {
|
||||
final String parentType = "parent";
|
||||
|
||||
int parentsNum = atLeast(5);
|
||||
int depth = atLeast(3);
|
||||
|
||||
ContextHolder holder = new ContextHolder();
|
||||
|
||||
String config = createRandomizedConfig(depth, parentType, parentsNum, holder);
|
||||
runFullImport(config);
|
||||
|
||||
assertTrue("Update request processor processAdd was not called", TestUpdateRequestProcessor.processAddCalled);
|
||||
assertTrue("Update request processor processCommit was not callled", TestUpdateRequestProcessor.processCommitCalled);
|
||||
assertTrue("Update request processor finish was not called", TestUpdateRequestProcessor.finishCalled);
|
||||
|
||||
assertQ(req("type_s:" + parentType), "//*[@numFound='" + parentsNum + "']");
|
||||
assertQ(req("-type_s:"+ parentType), "//*[@numFound='" + (holder.counter - parentsNum) + "']");
|
||||
|
||||
// let's check BlockJoin
|
||||
Hierarchy randomHierarchy = holder.hierarchies.get(random().nextInt(holder.hierarchies.size()));
|
||||
|
||||
Query deepestQuery = createBlockJoinQuery(randomHierarchy);
|
||||
assertSearch(deepestQuery, FIELD_ID, (String) randomHierarchy.elementData.get(FIELD_ID));
|
||||
}
|
||||
|
||||
private Query createBlockJoinQuery(Hierarchy hierarchy) {
|
||||
List<Hierarchy> elements = hierarchy.elements;
|
||||
if (elements.isEmpty()) {
|
||||
BooleanQuery childQuery = new BooleanQuery();
|
||||
childQuery.add(new TermQuery(new Term(FIELD_ID, (String) hierarchy.elementData.get(FIELD_ID))), Occur.MUST);
|
||||
return childQuery;
|
||||
}
|
||||
|
||||
Query childQuery = createBlockJoinQuery(elements.get(random().nextInt(elements.size())));
|
||||
return createToParentQuery(hierarchy.elementType, childQuery);
|
||||
}
|
||||
|
||||
private ToParentBlockJoinQuery createToParentQuery(String parentType, String childField, String childFieldValue) {
|
||||
BooleanQuery childQuery = new BooleanQuery();
|
||||
childQuery.add(new TermQuery(new Term(childField, childFieldValue)), Occur.MUST);
|
||||
ToParentBlockJoinQuery result = createToParentQuery(parentType, childQuery);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private ToParentBlockJoinQuery createToParentQuery(String parentType, Query childQuery) {
|
||||
ToParentBlockJoinQuery blockJoinQuery = new ToParentBlockJoinQuery(childQuery, createParentFilter(parentType), ScoreMode.Avg);
|
||||
|
||||
return blockJoinQuery;
|
||||
}
|
||||
|
||||
private void assertSearch(Query query, String field, String... values) throws IOException {
|
||||
/* The limit of search queue is doubled to catch the error in case when for some reason there are more docs than expected */
|
||||
SolrIndexSearcher searcher = req.getSearcher();
|
||||
TopDocs result = searcher.search(query, values.length * 2);
|
||||
assertEquals(values.length, result.totalHits);
|
||||
List<String> actualValues = new ArrayList<String>();
|
||||
for (int index = 0; index < values.length; ++index) {
|
||||
StoredDocument doc = searcher.doc(result.scoreDocs[index].doc);
|
||||
actualValues.add(doc.get(field));
|
||||
}
|
||||
|
||||
for (String expectedValue: values) {
|
||||
boolean removed = actualValues.remove(expectedValue);
|
||||
if (!removed) {
|
||||
fail("Search result does not contain expected values");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private List<String> createDataIterator(String query, String type, String description, int count) {
|
||||
List<Map<String, Object>> data = new ArrayList<Map<String, Object>>();
|
||||
List<String> ids = new ArrayList<String>(count);
|
||||
for (int index = 0; index < count; ++index) {
|
||||
String docId = nextId();
|
||||
ids.add(docId);
|
||||
Map<String, Object> doc = createMap(FIELD_ID, docId, "desc", docId + " " + description, "type_s", type);
|
||||
data.add(doc);
|
||||
}
|
||||
Collections.shuffle(data, random());
|
||||
MockDataSource.setIterator(query, data.iterator());
|
||||
|
||||
return ids;
|
||||
}
|
||||
|
||||
/** Internally configures MockDataSource **/
|
||||
private String createRandomizedConfig(int depth, String parentType, int parentsNum, ContextHolder holder) {
|
||||
List<Hierarchy> parentData = createMockedIterator(parentType, "SELECT * FROM " + parentType, parentsNum, holder);
|
||||
|
||||
// each map represents parent and each parent is root of separate hierarchy
|
||||
holder.hierarchies = parentData;
|
||||
|
||||
String children = createChildren(parentType, 0, depth, parentData, holder);
|
||||
|
||||
String rootFields = createFieldsList(FIELD_ID, "desc", "type_s");
|
||||
String rootEntity = MessageFormat.format(rootEntityTemplate, parentType, "SELECT * FROM " + parentType, rootFields, children);
|
||||
|
||||
String config = MessageFormat.format(dataConfigTemplate, rootEntity);
|
||||
return config;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private List<Hierarchy> createMockedIterator(String type, String query, int amount, ContextHolder holder) {
|
||||
List<Hierarchy> hierarchies = new ArrayList<Hierarchy>();
|
||||
List<Map<String, Object>> data = new ArrayList<Map<String, Object>>();
|
||||
for (int index = 0; index < amount; ++index) {
|
||||
holder.counter++;
|
||||
String idStr = String.valueOf(holder.counter);
|
||||
Map<String, Object> element = createMap(FIELD_ID, idStr, "desc", type + "_" + holder.counter, "type_s", type);
|
||||
data.add(element);
|
||||
|
||||
Hierarchy hierarchy = new Hierarchy();
|
||||
hierarchy.elementType = type;
|
||||
hierarchy.elementData = element;
|
||||
hierarchies.add(hierarchy);
|
||||
}
|
||||
|
||||
MockDataSource.setIterator(query, data.iterator());
|
||||
|
||||
return hierarchies;
|
||||
}
|
||||
|
||||
private List<Hierarchy> createMockedIterator(String type, List<Hierarchy> parentData, ContextHolder holder) {
|
||||
List<Hierarchy> result = new ArrayList<Hierarchy>();
|
||||
for (Hierarchy parentHierarchy: parentData) {
|
||||
Map<String, Object> data = parentHierarchy.elementData;
|
||||
String id = (String) data.get(FIELD_ID);
|
||||
String select = String.format(Locale.ROOT, "select * from %s where %s='%s'", type, type + "_parent_id", id);
|
||||
List<Hierarchy> childHierarchies = createMockedIterator(type, select, atLeast(5), holder);
|
||||
parentHierarchy.elements.addAll(childHierarchies);
|
||||
result.addAll(childHierarchies);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private String createChildren(String parentName, int currentLevel, int maxLevel,
|
||||
List<Hierarchy> parentData, ContextHolder holder) {
|
||||
|
||||
if (currentLevel == maxLevel) { //recursion base
|
||||
return "";
|
||||
}
|
||||
|
||||
int childrenNumber = atLeast(2);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int childIndex = 0; childIndex < childrenNumber; ++childIndex) {
|
||||
String childName = parentName + "Child" + childIndex;
|
||||
String fields = createFieldsList(FIELD_ID, "desc", "type_s");
|
||||
String select = String.format(Locale.ROOT, "select * from %s where %s='%s'", childName, childName + "_parent_id", "${" + parentName + ".id}");
|
||||
|
||||
//for each child entity create several iterators
|
||||
List<Hierarchy> childData = createMockedIterator(childName, parentData, holder);
|
||||
|
||||
String subChildren = createChildren(childName, currentLevel + 1, maxLevel, childData, holder);
|
||||
String child = MessageFormat.format(childEntityTemplate, childName, select, fields, subChildren);
|
||||
builder.append(child);
|
||||
builder.append('\n');
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private String createFieldsList(String... fields) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (String field: fields) {
|
||||
String text = String.format(Locale.ROOT, "<field column='%s' />", field);
|
||||
builder.append(text);
|
||||
builder.append('\n');
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private final String threeLevelHierarchyConfig = "<dataConfig>\n" +
|
||||
" <dataSource type='MockDataSource' />\n" +
|
||||
" <document>\n" +
|
||||
" <entity name='PARENT' query='select * from PARENT'>\n" +
|
||||
" <field column='id' />\n" +
|
||||
" <field column='desc' />\n" +
|
||||
" <field column='type_s' />\n" +
|
||||
" <entity child='true' name='CHILD' query=\"select * from CHILD where parent_id='${PARENT.id}'\">\n" +
|
||||
" <field column='id' />\n" +
|
||||
" <field column='desc' />\n" +
|
||||
" <field column='type_s' />\n" +
|
||||
" <entity child='true' name='GRANDCHILD' query=\"select * from GRANDCHILD where parent_id='${CHILD.id}'\">\n" +
|
||||
" <field column='id' />\n" +
|
||||
" <field column='desc' />\n" +
|
||||
" <field column='type_s' />\n" +
|
||||
" </entity>\n" +
|
||||
" </entity>\n" +
|
||||
" </entity>\n" +
|
||||
" </document>\n" +
|
||||
"</dataConfig>";
|
||||
|
||||
/** {0} is rootEntity block **/
|
||||
private final String dataConfigTemplate = "<dataConfig><dataSource type=\"MockDataSource\" />\n<document>\n {0}</document></dataConfig>";
|
||||
|
||||
/**
|
||||
* {0} - entityName,
|
||||
* {1} - select query
|
||||
* {2} - fieldsList
|
||||
* {3} - childEntitiesList
|
||||
**/
|
||||
private final String rootEntityTemplate = "<entity name=\"{0}\" query=\"{1}\">\n{2} {3}\n</entity>\n";
|
||||
|
||||
/**
|
||||
* {0} - entityName,
|
||||
* {1} - select query
|
||||
* {2} - fieldsList
|
||||
* {3} - childEntitiesList
|
||||
**/
|
||||
private final String childEntityTemplate = "<entity " + ConfigNameConstants.CHILD + "=\"true\" name=\"{0}\" query=\"{1}\">\n {2} {3} </entity>\n";
|
||||
|
||||
private BitDocIdSetFilter createParentFilter(String type) {
|
||||
BooleanQuery parentQuery = new BooleanQuery();
|
||||
parentQuery.add(new TermQuery(new Term("type_s", type)), Occur.MUST);
|
||||
return new BitDocIdSetCachingWrapperFilter(new QueryWrapperFilter(parentQuery));
|
||||
}
|
||||
|
||||
private String nextId() {
|
||||
++id;
|
||||
return String.valueOf(id);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue