discourse/lib/onebox/engine/wikipedia_onebox.rb

# frozen_string_literal: true

module Onebox
  module Engine
    # Onebox engine for Wikipedia article URLs.
    #
    # Builds the preview data (link, title, description and, when available,
    # an image) from the fetched page. When the URL carries a fragment that
    # identifies a section of the article, the description is taken from that
    # section; otherwise it is taken from the paragraphs of the whole page.
    class WikipediaOnebox
      include Engine
      include LayoutSupport
      include HTML

      # Sibling node names that are considered part of a section's body.
      # NOTE: the pattern is unanchored, so it also matches names that merely
      # contain one of the alternatives (e.g. "span", "pre"). This is kept
      # as-is so the generated previews do not change.
      SECTION_CONTENT_TAGS = /p|text|div|ul/
      # At most this many paragraphs/lists contribute to the description.
      MAX_PARAGRAPHS = 4
      # Footnote markers such as "[12]" that are stripped from the text.
      CITATION_MARKER = /\[\d+\]/mi
      # Images whose src matches this pattern are never used as preview image.
      IGNORED_IMAGE = /Question_book/
      private_constant :SECTION_CONTENT_TAGS, :MAX_PARAGRAPHS, :CITATION_MARKER, :IGNORED_IMAGE

      matches_regexp(%r{^https?://.*\.wikipedia\.(com|org)})
      always_https

      private

      # Assembles the hash rendered by the onebox layout.
      #
      # @return [Hash] with :link, :title, :description and, when a suitable
      #   image is found, :image
      def data
        section_title_text = nil
        paras = nil

        if (section_id = wikipedia_section_id)
          # The id is passed as an XPath variable instead of being interpolated
          # into the expression: XPath string literals have no escape syntax, so
          # a fragment containing a quote would otherwise break the query.
          section_header_title = raw.xpath("//span[@id=$id]", nil, { id: section_id })

          unless section_header_title.empty?
            section_title_text = section_header_title.inner_text
            # The parent of the section span is expected to be the heading node.
            paras = wikipedia_section_paragraphs(section_header_title[0].parent)
          end
        end

        # No fragment, or no matching section: fall back to every paragraph.
        paras ||= raw.search("p")

        result = {
          link: link,
          # If a section title exists, append it to the main article title.
          title:
            raw.css("html body h1").inner_text +
              (section_title_text ? " | " + section_title_text : ""),
          description: wikipedia_description(paras),
        }

        image = wikipedia_image_src
        result[:image] = image if image

        result
      end

      # Extracts and URL-decodes the fragment ("#Section") of the onebox URL.
      #
      # @return [String, nil] the decoded fragment, or nil when the URL has none
      def wikipedia_section_id
        fragment = @url.match(%r{#([^/?]+)})
        fragment && CGI.unescape(fragment[1])
      end

      # Walks the siblings that follow a section heading and collects the <p>
      # and <ul> nodes that make up the section.
      #
      # Until the first <p>/<ul> is found every sibling is skipped over; after
      # that, the walk stops at the first sibling whose name does not match
      # SECTION_CONTENT_TAGS. If the section itself has no <p>/<ul>, this may
      # therefore pick up a node belonging to the following section. The walk
      # also stops when there are no more siblings.
      #
      # @param section_header the heading node the section starts at
      # @return [Array] the collected <p>/<ul> nodes, in document order
      def wikipedia_section_paragraphs(section_header)
        collected = []
        node = section_header

        while (node = node.next_sibling)
          break if !collected.empty? && !SECTION_CONTENT_TAGS.match?(node.name)

          # Lists are treated like paragraphs so that list-only sections
          # (bibliographies, discographies, ...) still produce a description.
          collected << node if node.name == "p" || node.name == "ul"
        end

        collected
      end

      # Joins the text of the leading paragraphs/lists into the description,
      # stopping once the layout's maximum text length has been reached.
      #
      # @param paras [#first] nodes to summarise (node set or array)
      # @return [String] the description, suffixed with "..." when truncated
      def wikipedia_description(paras)
        limit = Onebox::LayoutSupport.max_text
        text = +""

        paras
          .first(MAX_PARAGRAPHS)
          .each_with_index do |node, index|
            break if text.length >= limit

            text << " " unless index == 0
            chunk = node.name == "ul" ? wikipedia_list_text(node) : node.inner_text
            text << chunk[0..limit].gsub(CITATION_MARKER, "")
          end

        text.length > limit ? "#{text[0..limit]}..." : text
      end

      # Renders a <ul> node as a numbered, pipe-separated textual list
      # ("1.item | 2.item | 3.item").
      #
      # @param list the <ul> node
      # @return [String]
      def wikipedia_list_text(list)
        list
          .children
          .css("li")
          .each_with_index
          .map { |li, index| "#{index + 1}." + li.inner_text }
          .join(" |\n ")
      end

      # Picks the src of the first ".image img" element that has a src and is
      # not one of the ignored placeholder images.
      #
      # @return [String, nil]
      def wikipedia_image_src
        raw
          .css(".image img")
          .map { |img| img["src"] }
          .find { |src| src && !IGNORED_IMAGE.match?(src) }
      end
    end
  end
end
DEV: Absorb onebox gem into core (#12979) * Move onebox gem in core library * Update template file path * Remove warning for onebox gem caching * Remove onebox version file * Remove onebox gem * Add sanitize gem * Require onebox library in lazy-yt plugin * Remove onebox web specific code This code was used in standalone onebox Sinatra application * Merge Discourse specific AllowlistedGenericOnebox engine in core * Fix onebox engine filenames to match class name casing * Move onebox specs from gem into core * DEV: Rename `response` helper to `onebox_response` Fixes a naming collision. * Require rails_helper * Don't use `before/after(:all)` * Whitespace * Remove fakeweb * Remove poor unit tests * DEV: Re-add fakeweb, plugins are using it * Move onebox helpers * Stub Instagram API * FIX: Follow additional redirect status codes (#476) Don’t throw errors if we encounter 303, 307 or 308 HTTP status codes in responses * Remove an empty file * DEV: Update the license file Using the copy from https://choosealicense.com/licenses/gpl-2.0/# Hopefully this will enable GitHub to show the license UI? * DEV: Update embedded copyrights * DEV: Add Onebox copyright notice * DEV: Add MIT license, convert COPYRIGHT.txt to md * DEV: Remove an incorrect copyright claim Co-authored-by: Jarek Radosz <jradosz@gmail.com> Co-authored-by: jbrw <jamie@goatforce5.org> 2021-05-26 05:41:35 -04:00			`# frozen_string_literal: true`

			`module Onebox`
			`module Engine`
			`class WikipediaOnebox`
			`include Engine`
			`include LayoutSupport`
			`include HTML`

			`matches_regexp(%r{^https?://.*\.wikipedia\.(com\|org)})`
			`always_https`

			`private`

			`def data`
			`paras = []`
			`text = ""`

			`# Detect a section hash in the URL and retrieve the related paragraphs. If no hash is provided, the first few paragraphs will be used.`
			`# Author Lidlanca`
			`# Date 9/8/2014`
			`if (m_url_hash = @url.match(%r{#([^/?]+)})) # extract url hash`
			`m_url_hash_name = m_url_hash[1]`
			`end`

DEV: Enable `unless` cops We discussed the use of `unless` internally and decided to enforce available rules from rubocop to restrict its most problematic uses. 2023-02-16 04:40:11 -05:00			`if m_url_hash.nil? # no hash found in url`
			`paras = raw.search("p") # default get all the paras`
			`else`
FIX: Unescape the hash section, when present, to account for url-encoded chars. Section names containing reserved or non-ASCII characters appear url-encoded in the URL and need to be unescaped before they are used. In this case Wikipedia generates 2 different spans in the same page: one whose id is the result of replacing the % symbols with . and another with the decoded version of the string. For example, for /wiki/foo#A%C3%A1A it will generate: <span id="A.C3.A1A"></span> <span id="AáA">AáA</span> Unescaping the `m_url_hash_name` should work in all cases to target the proper section span. 2021-08-11 16:56:55 -04:00			`section_header_title = raw.xpath("//span[@id='#{CGI.unescape(m_url_hash_name)}']")`
DEV: Absorb onebox gem into core (#12979) * Move onebox gem in core library * Update template file path * Remove warning for onebox gem caching * Remove onebox version file * Remove onebox gem * Add sanitize gem * Require onebox library in lazy-yt plugin * Remove onebox web specific code This code was used in standalone onebox Sinatra application * Merge Discourse specific AllowlistedGenericOnebox engine in core * Fix onebox engine filenames to match class name casing * Move onebox specs from gem into core * DEV: Rename `response` helper to `onebox_response` Fixes a naming collision. * Require rails_helper * Don't use `before/after(:all)` * Whitespace * Remove fakeweb * Remove poor unit tests * DEV: Re-add fakeweb, plugins are using it * Move onebox helpers * Stub Instagram API * FIX: Follow additional redirect status codes (#476) Don’t throw errors if we encounter 303, 307 or 308 HTTP status codes in responses * Remove an empty file * DEV: Update the license file Using the copy from https://choosealicense.com/licenses/gpl-2.0/# Hopefully this will enable GitHub to show the license UI? * DEV: Update embedded copyrights * DEV: Add Onebox copyright notice * DEV: Add MIT license, convert COPYRIGHT.txt to md * DEV: Remove an incorrect copyright claim Co-authored-by: Jarek Radosz <jradosz@gmail.com> Co-authored-by: jbrw <jamie@goatforce5.org> 2021-05-26 05:41:35 -04:00
			`if section_header_title.empty?`
			`paras = raw.search("p") # default get all the paras`
			`else`
			`section_title_text = section_header_title.inner_text`
			`section_header = section_header_title[0].parent # parent element of the section span element should be an <h3> node`
			`cur_element = section_header`

			`# p\|text\|div covers the general case. We assume presence of at least 1 P node. if section has no P node we may end up with a P node from the next section.`
			`# A div tag is commonly used as an assets wrapper in an article section, often as the first element holding an image.`
			`# ul support will improve the output generated for a section with a list as the main content (for example: an author bibliography, a musician discography, etc.)`
			`first_p_found = nil`
			`while (`
			`((next_sibling = cur_element.next_sibling).name =~ /p\|text\|div\|ul/) \|\|`
			`first_p_found.nil?`
			`)`
			`# from section header get the next sibling until it is a breaker tag`
			`cur_element = next_sibling`
			`if (cur_element.name == "p" \|\| cur_element.name == "ul") #we treat a list as we detect a p to avoid showing`
			`first_p_found = true`
			`paras.push(cur_element)`
			`end`
			`end`
			`end`
			`end`

			`unless paras.empty?`
			`cnt = 0`
			`while text.length < Onebox::LayoutSupport.max_text && cnt <= 3`
			`break if cnt >= paras.size`
			`text += " " unless cnt == 0`

			`if paras[cnt].name == "ul" # Handle UL tag. Generate a textual ordered list (1.item \| 2.item \| 3.item). Unfortunately no newline allowed in output`
			`li_index = 1`
			`list_items = []`
			`paras[cnt]`
			`.children`
			`.css("li")`
			`.each do \|li\|`
			`list_items.push "#{li_index}." + li.inner_text`
			`li_index += 1`
DEV: Apply syntax_tree formatting to `lib/*` 2023-01-09 07:10:19 -05:00			`end`
DEV: Absorb onebox gem into core (#12979) * Move onebox gem in core library * Update template file path * Remove warning for onebox gem caching * Remove onebox version file * Remove onebox gem * Add sanitize gem * Require onebox library in lazy-yt plugin * Remove onebox web specific code This code was used in standalone onebox Sinatra application * Merge Discourse specific AllowlistedGenericOnebox engine in core * Fix onebox engine filenames to match class name casing * Move onebox specs from gem into core * DEV: Rename `response` helper to `onebox_response` Fixes a naming collision. * Require rails_helper * Don't use `before/after(:all)` * Whitespace * Remove fakeweb * Remove poor unit tests * DEV: Re-add fakeweb, plugins are using it * Move onebox helpers * Stub Instagram API * FIX: Follow additional redirect status codes (#476) Don’t throw errors if we encounter 303, 307 or 308 HTTP status codes in responses * Remove an empty file * DEV: Update the license file Using the copy from https://choosealicense.com/licenses/gpl-2.0/# Hopefully this will enable GitHub to show the license UI? * DEV: Update embedded copyrights * DEV: Add Onebox copyright notice * DEV: Add MIT license, convert COPYRIGHT.txt to md * DEV: Remove an incorrect copyright claim Co-authored-by: Jarek Radosz <jradosz@gmail.com> Co-authored-by: jbrw <jamie@goatforce5.org> 2021-05-26 05:41:35 -04:00			`paragraph = (list_items.join " \|\n ")[0..Onebox::LayoutSupport.max_text]`
			`else`
			`paragraph = paras[cnt].inner_text[0..Onebox::LayoutSupport.max_text]`
			`end`

			`paragraph.gsub!(/\[\d+\]/mi, "")`
			`text += paragraph`
			`cnt += 1`
			`end`
			`end`

			`text = "#{text[0..Onebox::LayoutSupport.max_text]}..." if text.length >`
			`Onebox::LayoutSupport.max_text`

			`result = {`
			`link: link,`
			`title:`
			`raw.css("html body h1").inner_text +`
			`(section_title_text ? " \| " + section_title_text : ""), #if a section sub title exists add it to the main article title`
			`description: text,`
			`}`

			`img = raw.css(".image img")`

			`if img && img.size > 0`
			`img.each do \|i\|`
			`src = i["src"]`
			`if src !~ /Question_book/`
			`result[:image] = src`
			`break`
			`end`
			`end`
			`end`

			`result`
			`end`
			`end`
			`end`
			`end`