The beginnings of a general purpose importer / data mapper in order to feed data into Solr

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@522416 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2007-03-26 04:47:56 +00:00
parent 1da74f6678
commit 6016746b57
8 changed files with 238 additions and 1 deletions

View File

@ -16,4 +16,6 @@ require 'solr/request'
require 'solr/connection'
require 'solr/response'
require 'solr/util'
require 'solr/xml'
require 'solr/xml'
require 'solr/importer'
require 'solr/indexer'

View File

@ -0,0 +1,15 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
module Solr; module Importer; end; end
require 'solr/importer/mapper'
require 'solr/importer/tab_delimited_file_source'

View File

@ -0,0 +1,48 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# For files with the first line containing field names
class Solr::Importer::Mapper
def initialize(mapping)
@mapping = mapping
end
def field_data(orig_data, field_name)
case field_name
when Symbol
orig_data[field_name]
else
field_name
end
end
def map(orig_data)
mapped_data = {}
@mapping.each do |solr_name, field_mapping|
value = case field_mapping
when Proc
field_mapping.call(orig_data)
when String, Symbol
field_data(orig_data, field_mapping)
when Enumerable
field_mapping.collect {|orig_field_name| field_data(orig_data, orig_field_name)}.flatten
else
raise "Unknown mapping for #{solr_name}: #{field_mapping}"
end
mapped_data[solr_name] = value if value
end
mapped_data
end
end

View File

@ -0,0 +1,37 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# For files with the first line containing field names
# Currently not designed for enormous files, as all lines are
# read into an array
class Solr::Importer::TabDelimitedFileSource
include Enumerable
def initialize(filename)
@filename = filename
end
def each
lines = IO.readlines(@filename)
headers = lines[0].split("\t").collect{|h| h.chomp}
lines[1..-1].each do |line|
data = headers.zip(line.split("\t").collect{|s| s.chomp})
def data.[](key)
self.assoc(key.to_s)[1]
end
yield(data)
end
end
end

View File

@ -0,0 +1,29 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Solr::Indexer
def self.index(data_source, mapper, options={})
solr_url = options[:solr_url] || ENV["SOLR_URL"] || "http://localhost:8983/solr"
solr = Solr::Connection.new(solr_url)
data_source.each do |record|
document = mapper.map(record)
# yield(document) if block_given? # TODO: does yielding add value here? possibly to allow the caller
# to manipulate the document outside of mappings just before indexing?
solr.add(document) unless options[:debug]
puts document.inspect if options[:debug]
end
solr.commit unless options[:debug]
end
end

View File

@ -0,0 +1,75 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
require 'solr'
require 'test/unit'
class DataMapperTest < Test::Unit::TestCase
def test_static_mapping
mapping = {:static => "value",
:static_array => ["value1", "value2"]}
mapper = Solr::Importer::Mapper.new(mapping)
mapped_data = mapper.map({})
assert_equal "value", mapped_data[:static]
assert_equal ["value1", "value2"], mapped_data[:static_array]
end
def test_simple_mapping
orig_data = {:orig_field => "value",
:multi1 => "val1", :multi2 => "val2"}
mapping = {:solr_field => :orig_field,
:mapped_array => [:multi1, :multi2], }
mapper = Solr::Importer::Mapper.new(mapping)
mapped_data = mapper.map(orig_data)
assert_equal "value", mapped_data[:solr_field]
assert_equal ["val1", "val2"], mapped_data[:mapped_array]
end
def test_proc
orig_data = {:orig_field => "value"}
mapping = {:solr_field => Proc.new {|record| ">#{record[:orig_field]}<"}}
mapper = Solr::Importer::Mapper.new(mapping)
mapped_data = mapper.map(orig_data)
assert_equal ">value<", mapped_data[:solr_field]
end
def test_overridden_field
mapping = {:solr_field => [:orig_field1, :orig_field2]}
orig_data = {:orig_field1 => "value1", :orig_field2 => "value2", }
mapper = Solr::Importer::Mapper.new(mapping)
def mapper.field_data(orig_data, field_name)
["~#{super(orig_data, field_name)}~"] # array tests that the value is flattened
end
mapped_data = mapper.map(orig_data)
assert_equal ["~value1~", "~value2~"], mapped_data[:solr_field]
end
def test_unknown_mapping
mapping = {:solr_field => /foo/} # regexp currently not a valid mapping type
mapper = Solr::Importer::Mapper.new(mapping)
assert_raise(RuntimeError) do
mapped_data = mapper.map({})
end
end
end

View File

@ -0,0 +1,2 @@
medium associatedURL boxHeightInInches boxLengthInInches boxWeightInPounds boxWidthInInches scannednumber upc asin country title fullTitle series numberInSeries edition aspect mediacount genre price currentValue language netrating description owner publisher published rare purchaseDate rating used signed hasExperienced notes location paid condition notowned author illustrator pages
book 9780865681743 0865681740 us Xing Yi Nei Gong: Xing Yi Health Maintenance and Internal Strength Development Xing Yi Nei Gong: Xing Yi Health Maintenance and Internal Strength Development Paperback $21.95 $14.05 4.5 This is the most complete book on the art of xing yi (hsing Yi) available. It includes the complete xing yi history and lineage going back eight generations; manuscripts handed down from famous practitioners Dai Long Bang and Li Neng Ran; 16 health maintenance and power development exercises; qigong (chi kung) exerices; xing yi long spear power training exercises; and more. Unique Publications 1998-02-10 12:00:00 +0000 2007-02-03 02:22:25 -0500 Dan Miller/ Tim Cartmell 200

View File

@ -0,0 +1,29 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
require 'solr'
require 'test/unit'
class TabDelimitedFileSourceTest < Test::Unit::TestCase
def test_load
filename = File.expand_path(File.dirname(__FILE__)) + "/tab_delimited.txt"
source = Solr::Importer::TabDelimitedFileSource.new(filename)
assert_equal source.to_a.size, 1
source.each do |data|
assert_equal data[:asin], '0865681740'
end
end
end