From 29ddb3a611806b6fdaac5d1aea7ca63d57bf03ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Mon, 24 Apr 2017 22:03:12 +0200 Subject: [PATCH] update JIVE API importer --- script/import_scripts/jive_api.rb | 72 ++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/script/import_scripts/jive_api.rb b/script/import_scripts/jive_api.rb index eec8dac7611..f270bf51c19 100644 --- a/script/import_scripts/jive_api.rb +++ b/script/import_scripts/jive_api.rb @@ -1,9 +1,12 @@ +require "nokogiri" require "htmlentities" +require_relative "./../../lib/html_to_markdown.rb" require File.expand_path(File.dirname(__FILE__) + "/base.rb") class ImportScripts::JiveApi < ImportScripts::Base - COUNT ||= 100 + USER_COUNT ||= 1000 + POST_COUNT ||= 100 STAFF_GUARDIAN ||= Guardian.new(Discourse.system_user) def initialize @@ -26,10 +29,10 @@ class ImportScripts::JiveApi < ImportScripts::Base puts "", "importing users..." imported_users = 0 - start_index = [0, Math.floor(PostCustomField.where(name: "import_id").count / COUNT.to_f) - COUNT].max + start_index = [0, UserCustomField.where(name: "import_id").count - USER_COUNT].max loop do - users = get("people/@all?fields=initialLogin,emails,displayName,mentionName,thumbnailUrl,-resources&count=#{COUNT}&startIndex=#{start_index}", true) + users = get("people/@all?fields=initialLogin,emails,displayName,mentionName,thumbnailUrl,-resources&count=#{USER_COUNT}&startIndex=#{start_index}", true) create_users(users["list"], offset: imported_users) do |user| { id: user["id"], @@ -41,7 +44,7 @@ class ImportScripts::JiveApi < ImportScripts::Base } end - break if users["list"].size < COUNT || users["links"].blank? || users["links"]["next"].blank? + break if users["list"].size < USER_COUNT || users["links"].blank? || users["links"]["next"].blank? 
imported_users += users["list"].size break unless start_index = users["links"]["next"][/startIndex=(\d+)/, 1] end @@ -51,25 +54,23 @@ class ImportScripts::JiveApi < ImportScripts::Base puts "", "importing discussions & questions..." start_index = 0 - fields = "fields=published,tags,contentID,author.id,content.text,subject,viewCount,question,-resources,-author.resources" + fields = "fields=published,contentID,author.id,content.text,subject,viewCount,question,-resources,-author.resources" filter = "&filter=creationDate(null,2017-01-01T00:00:00Z)" loop do - discussions = get("contents?#{fields}&filter=status(published)&filter=type(discussion)#{filter}&sort=dateCreatedAsc&count=#{COUNT}&startIndex=#{start_index}") + discussions = get("contents?#{fields}&filter=status(published)&filter=type(discussion)#{filter}&sort=dateCreatedAsc&count=#{POST_COUNT}&startIndex=#{start_index}") discussions["list"].each do |discussion| topic = { id: discussion["contentID"], created_at: discussion["published"], title: @htmlentities.decode(discussion["subject"]), - raw: discussion["content"]["text"], + raw: process_raw(discussion["content"]["text"]), user_id: user_id_from_imported_user_id(discussion["author"]["id"]) || Discourse::SYSTEM_USER_ID, - # category: discussion["question"] ? 26 : 21, + # category: discussion["question"] ? 5 : 21, views: discussion["viewCount"], - cook_method: Post.cook_methods[:raw_html], custom_fields: { import_id: discussion["contentID"] }, post_create_action: proc do |post| - tags = discussion["tags"].compact.map(&:strip).select(&:present?) - DiscourseTagging.tag_topic_by_names(post.topic, STAFF_GUARDIAN, tags) unless tags.empty? + DiscourseTagging.tag_topic_by_names(post.topic, STAFF_GUARDIAN, ["legacy"]) end } @@ -79,7 +80,7 @@ class ImportScripts::JiveApi < ImportScripts::Base import_comments(discussion["contentID"], parent_post.topic_id) if parent_post end - break if discussions["list"].size < COUNT || discussions["links"].blank? 
|| discussions["links"]["next"].blank? + break if discussions["list"].size < POST_COUNT || discussions["links"].blank? || discussions["links"]["next"].blank? break unless start_index = discussions["links"]["next"][/startIndex=(\d+)/, 1] end end @@ -89,7 +90,7 @@ class ImportScripts::JiveApi < ImportScripts::Base fields = "fields=published,author.id,content.text,parent,answer,-resources,-author.resources" loop do - comments = get("messages/contents/#{discussion_id}?#{fields}&count=#{COUNT}&startIndex=#{start_index}") + comments = get("messages/contents/#{discussion_id}?#{fields}&count=#{POST_COUNT}&startIndex=#{start_index}") comments["list"].each do |comment| next if post_id_from_imported_post_id(comment["id"]) @@ -98,8 +99,7 @@ class ImportScripts::JiveApi < ImportScripts::Base created_at: comment["published"], topic_id: topic_id, user_id: user_id_from_imported_user_id(comment["author"]["id"]) || Discourse::SYSTEM_USER_ID, - raw: comment["content"]["text"], - cook_method: Post.cook_methods[:raw_html], + raw: process_raw(comment["content"]["text"]), custom_fields: { import_id: comment["id"] }, } post[:custom_fields][:is_accepted_answer] = true if comment["answer"] @@ -113,7 +113,7 @@ class ImportScripts::JiveApi < ImportScripts::Base create_post(post, post[:id]) end - break if comments["list"].size < COUNT || comments["links"].blank? || comments["links"]["next"].blank? + break if comments["list"].size < POST_COUNT || comments["links"].blank? || comments["links"]["next"].blank? break unless start_index = comments["links"]["next"][/startIndex=(\d+)/, 1] end end @@ -122,37 +122,61 @@ class ImportScripts::JiveApi < ImportScripts::Base puts "", "importing blog posts..." 
start_index = 0 - fields = "fields=published,tags,contentID,author.id,content.text,subject,viewCount,permalink,-resources,-author.resources" + fields = "fields=published,contentID,author.id,content.text,subject,viewCount,permalink,-resources,-author.resources" filter = "&filter=creationDate(null,2016-05-01T00:00:00Z)" loop do - posts = get("contents?#{fields}&filter=status(published)&filter=type(post)#{filter}&sort=dateCreatedAsc&count=#{COUNT}&startIndex=#{start_index}") + posts = get("contents?#{fields}&filter=status(published)&filter=type(post)#{filter}&sort=dateCreatedAsc&count=#{POST_COUNT}&startIndex=#{start_index}") posts["list"].each do |post| next if post_id_from_imported_post_id(post["contentID"]) pp = { id: post["contentID"], created_at: post["published"], title: @htmlentities.decode(post["subject"]), - raw: post["content"]["text"], + raw: process_raw(post["content"]["text"]), user_id: user_id_from_imported_user_id(post["author"]["id"]) || Discourse::SYSTEM_USER_ID, - # category: 7, + category: 7, views: post["viewCount"], - cook_method: Post.cook_methods[:raw_html], custom_fields: { import_id: post["contentID"], import_permalink: post["permalink"] }, post_create_action: proc do |p| - tags = post["tags"].compact.map(&:strip).select(&:present?) - DiscourseTagging.tag_topic_by_names(p.topic, STAFF_GUARDIAN, tags) unless tags.empty? + DiscourseTagging.tag_topic_by_names(p.topic, STAFF_GUARDIAN, ["legacy"]) end } create_post(pp, pp[:id]) end - break if posts["list"].size < COUNT || posts["links"].blank? || posts["links"]["next"].blank? + break if posts["list"].size < POST_COUNT || posts["links"].blank? || posts["links"]["next"].blank? break unless start_index = posts["links"]["next"][/startIndex=(\d+)/, 1] end end + def process_raw(raw) # parses the imported HTML with Nokogiri and converts it via HtmlToMarkdown + doc = Nokogiri::HTML.fragment(raw) + + # convert emoticon spans to :name: shortcodes; spans whose name Emoji does not know are removed + doc.css("span.emoticon-inline").each do |span| + name = span["class"][/emoticon_(\w+)/, 1]&.downcase + name && Emoji.exists?(name) ?
span.replace(":#{name}:") : span.remove + end + + # convert profile links to @mentions, using the link text as the name + doc.css("a.jive-link-profile-small").each { |a| a.replace("@#{a.content}") } + + # fix links + # doc.css("a[href]").each do |a| + # if a["href"]["#{@base_uri}/docs/DOC-"] + # a["href"] = a["href"][/#{Regexp.escape(@base_uri)}\/docs\/DOC-\d+/] + # elsif a["href"][@base_uri] + # a.replace(a.inner_html) + # end + # end + + html = (doc.at(".jive-rendered-content") || doc).to_html # at returns nil when the wrapper element is missing; use the whole fragment then + + HtmlToMarkdown.new(html).to_markdown + end + def mark_topics_as_solved puts "", "Marking topics as solved..."