# lucene/client/ruby/solr-ruby/examples/tang/tang_importer.rb
#
# 58 lines
# 2.0 KiB
# Ruby
# Executable File

# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
require 'hpricot'
require 'solr'
# Solr endpoint; the SOLR_URL environment variable overrides the local default.
solr_url = ENV["SOLR_URL"] || "http://localhost:8983/solr"

# With "-debug" as the second argument the script only prints each document's
# summary; nothing is added to or committed in Solr.
debug = ARGV[1] == "-debug"

solr = Solr::Connection.new(solr_url)

# The first argument is the path of the HTML file to import. File.open in
# block form closes the handle once parsing is done and, unlike Kernel#open,
# never interprets a leading "|" in the argument as a command to run.
html = File.open(ARGV[0]) { |source| Hpricot(source) }

# Number of sections to import; their markers are the zero-padded indices
# "001" up to this value.
max = 320
# Returns the first sibling *after* +elem+ whose name is "blockquote".
#
# +elem+ itself is never returned, even when it is a blockquote, so the
# method can be chained to walk from one blockquote to the next.
#
# Returns nil when +elem+ is nil or when no blockquote follows it, instead
# of failing with a NoMethodError on nil once the siblings run out.
def next_blockquote(elem)
  # Always step off the starting node first.
  elem = elem.next_sibling if elem
  # Skip over siblings that are not blockquotes; stop at the end of the list.
  elem = elem.next_sibling until elem.nil? || elem.name == "blockquote"
  elem
end
# Import every numbered section of the parsed page into Solr, one document
# per section.
#
# A section begins at the blockquote whose text is the zero-padded index
# ("001", "002", ...). The eight blockquotes that follow the marker are read
# in this order: type, author, title and body for the *_zh fields, then the
# same four for the *_en fields.
field_names = [
  :type_zh_facet, :author_zh_facet, :title_zh_text, :body_zh_text,
  :type_en_facet, :author_en_facet, :title_en_text, :body_en_text
]

(1..max).each do |current_index|
  section_label = format('%03d', current_index)
  section_start = html.at("//blockquote[text()='#{section_label}']")
  # Fail with a message naming the section rather than a NoMethodError on nil.
  raise "section #{section_label} not found in #{ARGV[0]}" unless section_start

  # Walk blockquote to blockquote, storing each one's text under its field.
  doc = {}
  elem = section_start
  field_names.each do |field|
    elem = next_blockquote(elem)
    raise "section #{section_label}: no blockquote found for #{field}" unless elem
    doc[field] = elem.inner_text
  end

  doc[:id] = current_index # TODO: namespace the id, something like "etext_tang:#{current_index}"
  doc[:source_facet] = 'etext_tang'
  doc[:language_facet] = ['chi','eng']

  # One summary per section: separator, id, then the *_en title, author and type.
  puts "----",doc[:id],doc[:title_en_text],doc[:author_en_facet],doc[:type_en_facet]
  # puts doc.inspect if debug
  solr.add doc unless debug
end

# In debug mode nothing was added, so there is nothing to commit.
solr.commit unless debug
#solr.optimize unless debug