discourse/lib/oneboxer/wikipedia_onebox.rb

require_dependency 'oneboxer/handlebars_onebox'

module Oneboxer
  # Onebox for Wikipedia article links: rewrites the link to the mobile
  # rendering of the article and extracts a title, a lead image and a short
  # text excerpt from the returned HTML.
  #
  # `matcher`, `favicon`, `template_path` and `MAX_TEXT` are not defined in
  # this file; presumably they come from HandlebarsOnebox / Oneboxer -- verify
  # against the base class.
  class WikipediaOnebox < HandlebarsOnebox

    matcher /^https?:\/\/.*wikipedia\.(com|org)\/.*$/
    favicon 'wikipedia.png'

    # Path of the template used to render this onebox.
    def template
      template_path('simple_onebox')
    end

    # Builds the URL of the mobile version of the linked article.
    #
    # Returns the original URL untouched when it is not a `/wiki/<title>`
    # link (e.g. `/w/index.php?title=...`), instead of raising on a nil match.
    def translate_url
      m = @url.match(/^https?:\/\/((?<subdomain>.+)\.)?wikipedia\.(com|org)\/wiki\/(?<identifier>[^#\/]+)/mi)
      return @url unless m

      # A link that already points at the mobile site ("en.m" or just "m")
      # must not get a second ".m" appended below.
      subdomain = (m[:subdomain] || "en").sub(/(\A|\.)m\z/i, "")
      subdomain = "en" if subdomain.empty?

      # "+" is a literal plus inside a URL path (e.g. /wiki/C++), but
      # CGI::unescape would decode it to a space, so protect it first.
      article_id = CGI::unescape(m[:identifier].gsub("+", "%2B"))

      # CGI::escape (unlike URI::encode) also escapes "&", "+" and "=", which
      # would otherwise corrupt the query string; URI::encode is also gone in
      # Ruby 3.0.
      "http://#{subdomain}.m.wikipedia.org/w/index.php?title=#{CGI::escape(article_id)}"
    end

    # Extracts the onebox fields from the article HTML.
    #
    # data - String of HTML (presumably the body fetched from translate_url;
    #        the caller is not visible in this file).
    #
    # Returns a Hash with :text (always), plus :title and :image when they
    # could be determined.
    def parse(data)
      html_doc = Nokogiri::HTML(data)

      result = {}

      # The document may have no <title> at all; and the non-bang gsub is
      # required because gsub! returns nil when nothing was replaced.
      title_node = html_doc.at('title')
      title = title_node && title_node.inner_html
      result[:title] = title.gsub(/ - Wikipedia.*$/, '') if title.present?

      # get the first image > 150 pix high
      tall_image = html_doc.search("img").find { |img| img['height'].to_i > 150 }
      if tall_image
        src = tall_image["src"].to_s
        unless src.empty?
          # only protocol-relative sources ("//upload...") need a scheme
          result[:image] = src.start_with?("//") ? "http:#{src}" : src
        end
      end

      # remove the table from mobile layout, as it can contain paras in some rare cases
      html_doc.search("table").remove

      # Build the excerpt from at most the first four paragraphs, stopping as
      # soon as enough text has been collected. `first(4)` copes with pages
      # that have fewer paragraphs than that.
      text = ""
      html_doc.search("p").first(4).each do |para|
        break if text.length >= MAX_TEXT

        # strip footnote markers such as "[12]"
        paragraph = para.inner_text[0, MAX_TEXT].gsub(/\[\d+\]/mi, "")
        next if paragraph.empty?

        text << " " unless text.empty?
        text << paragraph
      end

      # Keep exactly MAX_TEXT characters before the ellipsis.
      text = "#{text[0, MAX_TEXT]}..." if text.length > MAX_TEXT
      result[:text] = text
      result
    end

  end
end
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`require_dependency 'oneboxer/handlebars_onebox'`

			`module Oneboxer`
			`class WikipediaOnebox < HandlebarsOnebox`

globalized wikipedia onebox 2013-03-06 21:30:40 -05:00			`matcher /^https?:\/\/.*wikipedia\.(com\|org)\/.*$/`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`favicon 'wikipedia.png'`

			`def template`
			`template_path('simple_onebox')`
			`end`

			`def translate_url`
globalized wikipedia onebox 2013-03-06 21:30:40 -05:00			`m = @url.match(/^https?:\/\/((?<subdomain>.+)\.)?wikipedia\.(com\|org)\/wiki\/(?<identifier>[^#\/]+)/mi)`
			`subdomain = m[:subdomain] \|\| "en"`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`article_id = CGI::unescape(m[:identifier])`
globalized wikipedia onebox 2013-03-06 21:30:40 -05:00			`"http://#{subdomain}.m.wikipedia.org/w/index.php?title=#{URI::encode(article_id)}"`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`end`

			`def parse(data)`

Replace Hpricot with Nokogiri 2013-02-12 09:46:45 -05:00			`html_doc = Nokogiri::HTML(data)`
Initial release of Discourse 2013-02-05 14:16:51 -05:00
			`result = {}`

Replace Hpricot with Nokogiri 2013-02-12 09:46:45 -05:00			`title = html_doc.at('title').inner_html`
globalized wikipedia onebox 2013-03-06 21:30:40 -05:00			`result[:title] = title.gsub!(/ - Wikipedia.*$/, '') if title.present?`
Initial release of Discourse 2013-02-05 14:16:51 -05:00
			`# get the first image > 150 pix high`
Replace Hpricot with Nokogiri 2013-02-12 09:46:45 -05:00			`images = html_doc.search("img").select { \|img\| img['height'].to_i > 150 }`
remove trailing whitespaces :heart: 2013-02-25 11:42:20 -05:00
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`result[:image] = "http:#{images[0]["src"]}" unless images.empty?`

			`# remove the table from mobile layout, as it can contain paras in some rare cases`
Replace Hpricot with Nokogiri 2013-02-12 09:46:45 -05:00			`html_doc.search("table").remove`
Initial release of Discourse 2013-02-05 14:16:51 -05:00
			`# get all the paras`
Replace Hpricot with Nokogiri 2013-02-12 09:46:45 -05:00			`paras = html_doc.search("p")`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`text = ""`

			`unless paras.empty?`
			`cnt = 0`
enforce coding convention replaced every `and` by `&&` and every `or` by `\|\|` 2013-03-04 19:42:44 -05:00			`while text.length < MAX_TEXT && cnt <= 3`
Initial release of Discourse 2013-02-05 14:16:51 -05:00			`text << " " unless cnt == 0`
			`paragraph = paras[cnt].inner_text[0..MAX_TEXT]`
			`paragraph.gsub!(/\[\d+\]/mi, "")`
			`text << paragraph`
			`cnt += 1`
			`end`
			`end`

			`text = "#{text[0..MAX_TEXT]}..." if text.length > MAX_TEXT`
			`result[:text] = text`
			`result`
			`end`

			`end`
			`end`