# discourse/lib/oneboxer/wikipedia_onebox.rb
require_dependency 'oneboxer/handlebars_onebox'
module Oneboxer
class WikipediaOnebox < HandlebarsOnebox
# URLs handled by this onebox: http(s) links whose host is wikipedia.com/.org
# or any subdomain of it (en., de.m., zh-min-nan., ...).
# The host part is restricted to hostname characters so that look-alike hosts
# ("notwikipedia.org") and URLs that only mention wikipedia.org in their path
# are not claimed. `\A` anchors at the start of the string rather than at the
# start of any line. The (com|org) group is kept capturing so group numbering
# is the same as before.
matcher %r{\Ahttps?://(?:[a-z0-9\-]+\.)*wikipedia\.(com|org)/.*$}i
# Icon shown next to the onebox.
favicon 'wikipedia.png'
# Template used to render this onebox.
#
# @return whatever `template_path` resolves for the 'simple_onebox' template
def template
  template_name = 'simple_onebox'
  template_path(template_name)
end
# Rewrites the article URL held in @url to the matching mobile-site URL
# (http://<lang>.m.wikipedia.org/w/index.php?title=<article>).
#
# The language subdomain defaults to "en" when the URL has none. Any fragment
# or extra path segment after the article identifier is dropped.
#
# @return [String] the mobile URL, or @url unchanged when it is not a
#   /wiki/<article> URL (e.g. an index.php?title=... link); previously that
#   case raised NoMethodError on the nil match.
def translate_url
  # Host labels are limited to hostname characters so the "subdomain" can
  # never swallow a slash and point the rewritten URL at a foreign host.
  pattern = %r{\Ahttps?://(?:(?<subdomain>[a-z0-9.\-]+)\.)?wikipedia\.(?:com|org)/wiki/(?<identifier>[^#/]+)}i
  m = @url.to_s.match(pattern)
  return @url unless m

  # Drop an existing mobile marker so "en.m" does not become "en.m.m".
  subdomain = m[:subdomain].to_s.sub(/(\A|\.)m\z/i, "")
  subdomain = "en" if subdomain.empty?

  # In a URL path "+" is a literal plus, but CGI.unescape would decode it as
  # a space, so protect it before decoding (e.g. /wiki/C++).
  article_id = CGI.unescape(m[:identifier].gsub("+", "%2B"))

  # The title travels as a query value, so it needs form-style escaping:
  # "&" and "+" must be percent-encoded or the title gets cut off or altered.
  # (URI.encode left them untouched and no longer exists in Ruby 3.)
  "http://#{subdomain}.m.wikipedia.org/w/index.php?title=#{CGI.escape(article_id)}"
end
# Extracts the onebox fields from the HTML of a fetched article page.
#
# @param data [String] raw HTML document
# @return [Hash] with
#   :title - <title> contents minus the " - Wikipedia..." suffix
#            (only set when the page has a non-blank title)
#   :image - source of the first <img> whose height attribute exceeds 150
#            (only set when such an image with a src exists)
#   :text  - the leading paragraphs (at most four) joined by spaces, with
#            "[n]" citation markers removed, cut to MAX_TEXT characters
#            plus "..." when longer
def parse(data)
  html_doc = Nokogiri::HTML(data)
  result = {}

  # A page without a <title> yields nil here rather than a node.
  title_node = html_doc.at('title')
  title = title_node ? title_node.inner_html.to_s : ""
  # Non-destructive gsub on purpose: gsub! returns nil when there is nothing
  # to strip, which would discard a title that lacks the suffix.
  result[:title] = title.gsub(/ - Wikipedia.*$/, '') unless title.strip.empty?

  # get the first image > 150 pix high
  image = html_doc.search("img").find { |img| img['height'].to_i > 150 }
  src = image ? image["src"].to_s : ""
  unless src.empty?
    # Only protocol-relative sources ("//host/...") need a scheme prepended;
    # an already absolute URL is used as is.
    result[:image] = src.start_with?("//") ? "http:#{src}" : src
  end

  # remove the table from mobile layout, as it can contain paras in some rare cases
  html_doc.search("table").remove

  # Collect up to the first four paragraphs, stopping early once enough text
  # has been gathered. Iterating over what actually exists avoids indexing
  # past the end on pages with fewer than four paragraphs.
  text = String.new
  html_doc.search("p").first(4).each_with_index do |para, index|
    break if text.length >= MAX_TEXT

    text << " " unless index.zero?
    # Strip citation markers before any truncation so none is cut in half.
    text << para.inner_text.gsub(/\[\d+\]/, "")
  end

  # Keep exactly MAX_TEXT characters and flag the cut with an ellipsis.
  text = "#{text[0, MAX_TEXT]}..." if text.length > MAX_TEXT
  result[:text] = text

  result
end
end
end