New yahoo groups importer

This commit is contained in:
Jon Bartlett 2017-03-24 13:49:32 +11:00 committed by Gerhard Schlager
parent a3a0e36563
commit 5d8508c523
1 changed files with 159 additions and 0 deletions
script/import_scripts

View File

@ -0,0 +1,159 @@
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'mongo'
# Import YahooGroups data as exported into MongoDB by:
# https://github.com/jonbartlett/yahoo-groups-export
#
# Optionally paste these lines into your shell before running this:
#
# =begin
# export CATEGORY_ID=<CATEGORY_ID>
# =end
# Imports Yahoo Groups messages that were exported into MongoDB by
# https://github.com/jonbartlett/yahoo-groups-export.
#
# Messages are read from the `posts` collection. The fields used from each
# document's `ygData` sub-document are: profile, authorName, from, msgId,
# topicId, subject, messageBody and postDate.
#
# Environment variables (all optional):
#   CATEGORY_ID   passed as the `category` of every created topic
#   MONGODB_HOST  "host:port" of the MongoDB server (default 192.168.10.1:27017)
#   MONGODB_DB    MongoDB database name (default syncro)
class ImportScripts::YahooGroup < ImportScripts::Base
  MONGODB_HOST = ENV.fetch('MONGODB_HOST', '192.168.10.1:27017')
  MONGODB_DB = ENV.fetch('MONGODB_DB', 'syncro')

  # Sets up the MongoDB client and the in-memory profile lookup.
  def initialize
    super

    client = Mongo::Client.new([MONGODB_HOST], database: MONGODB_DB)
    # Suppress driver log output below FATAL.
    Mongo::Logger.logger.level = Logger::FATAL
    puts "connected to db...."

    @collection = client[:posts]
    # Yahoo profile name => generated import user id (filled by #import_users).
    @user_profile_map = {}
  end

  # Runs the import: users first, so that #import_discussions can resolve
  # authors through @user_profile_map.
  def execute
    puts "", "Importing from Mongodb...."
    import_users
    import_discussions
    puts "", "Done"
  end

  # Creates one user per distinct Yahoo profile name found in the collection
  # and records the profile => generated id mapping in @user_profile_map.
  def import_users
    puts '', "Importing users"

    # fetch distinct list of Yahoo "profile" names
    profiles = @collection.aggregate(
      [
        { "$group": { "_id": { profile: "$ygData.profile" } } }
      ]
    )

    user_id = 0

    create_users(profiles.to_a) do |u|
      user_id += 1

      # Use the message with the highest msgId for this profile to pick up
      # the latest user info, as name/email may have changed over time.
      user_info = @collection
        .find("ygData.profile": u["_id"]["profile"])
        .sort("ygData.msgId": -1)
        .limit(1)
        .to_a
        .first
      yg = user_info["ygData"]

      # Store profile => user_id lookup for #import_discussions
      @user_profile_map.store(yg["profile"], user_id)
      puts "User created: #{yg["profile"]}"

      {
        id: user_id, # yahoo "userId" sequence appears to have changed mid forum life so generate this
        username: yg["profile"],
        name: yg["authorName"],
        email: yg["from"], # mandatory
        created_at: Time.now
      }
    end

    puts "#{user_id} users created"
  end

  # Creates one topic per distinct ygData.topicId, followed by its replies.
  # A topicId with no message whose msgId equals it is skipped entirely.
  def import_discussions
    puts "", "Importing discussions"

    topics_count = 0
    posts_count = 0

    topics = @collection.aggregate(
      [
        { "$group": { "_id": { topicId: "$ygData.topicId" } } }
      ]
    ).to_a
    total_topics = topics.size

    topics.each_with_index do |t, tidx|
      # The opening message of a topic is the one whose msgId equals the topicId.
      topic_post = @collection.find("ygData.msgId": t["_id"]["topicId"]).to_a.first
      next if topic_post.nil?

      yg = topic_post["ygData"]
      topic_number = tidx + 1
      percent = format('%.2f', (topic_number.to_f / total_topics.to_f) * 100)
      puts "Topic: #{topic_number} / #{total_topics} (#{percent}%) Subject: #{yg["subject"]}"

      # NOTE(review): user_id is the sequence number generated in #import_users,
      # not a looked-up Discourse user id; confirm that create_post in base.rb
      # resolves it as intended. Unknown profiles fall back to -1.
      topic = {
        id: topic_number,
        user_id: @user_profile_map[yg["profile"]] || -1,
        raw: yg["messageBody"],
        created_at: Time.at(yg["postDate"].to_i),
        cook_method: Post.cook_methods[:raw_html],
        title: yahoo_topic_title(yg["subject"]),
        category: ENV['CATEGORY_ID'],
        custom_fields: { import_id: yg["msgId"] }
      }

      topics_count += 1

      # create topic post
      parent_post = create_post(topic, topic[:id])

      posts_count += import_yahoo_replies(yg["topicId"], parent_post)
    end

    puts "", "Imported #{topics_count} topics with #{topics_count + posts_count} posts."
  end

  private

  # Returns the subject to use as topic title, or "No Subject" when the
  # subject is nil or empty.
  def yahoo_topic_title(subject)
    subject.to_s.empty? ? "No Subject" : subject
  end

  # Creates every reply of the topic identified by +topic_id+ underneath
  # +parent_post+ (the value returned by create_post for the opening message).
  # Replies are created in ascending msgId order so that the imported topic
  # keeps a stable order instead of MongoDB's unspecified natural order.
  # Returns the number of replies created.
  def import_yahoo_replies(topic_id, parent_post)
    messages = @collection
      .find("ygData.topicId": topic_id)
      .sort("ygData.msgId": 1)
      .to_a

    created = 0

    messages.each_with_index do |message, pidx|
      yg = message["ygData"]

      # skip over first post as this is created by the topic itself
      next if yg["msgId"] == topic_id

      puts " Post: #{pidx + 1} / #{messages.size}"

      # NOTE(review): id restarts for every topic, so the value passed to
      # create_post is not unique across the import; verify against base.rb
      # whether it should be the Yahoo msgId instead.
      post = {
        id: pidx + 1,
        topic_id: parent_post[:topic_id],
        user_id: @user_profile_map[yg["profile"]] || -1,
        raw: yg["messageBody"],
        created_at: Time.at(yg["postDate"].to_i),
        cook_method: Post.cook_methods[:raw_html],
        custom_fields: { import_id: yg["msgId"] }
      }

      create_post(post, post[:id])
      created += 1
    end

    created
  end
end
# Entry point: build the importer and start it. #perform is not defined in
# this file; presumably it is inherited from ImportScripts::Base and ends up
# calling #execute — confirm in base.rb.
ImportScripts::YahooGroup.new.perform