2018-03-15 17:10:45 -04:00
|
|
|
# Tracks how many requests each web-crawler user agent makes per day.
#
# Hits are first counted in Redis and later flushed into the
# `web_crawler_requests` table (one row per date + user agent) by
# `write_cache!`.
#
# NOTE(review): `perform_increment!`, `get_and_reset`, `request_id`,
# `last_flush=` and `autoflush_seconds=` are not defined in this file;
# presumably they are supplied by `CachedCounting` -- confirm there.
class WebCrawlerRequest < ActiveRecord::Base
  include CachedCounting

  # auto flush if older than this
  self.autoflush_seconds = 1.hour

  cattr_accessor :max_record_age, :max_records_per_day

  # only keep the top records based on request count
  self.max_records_per_day = 200

  # delete records older than this
  self.max_record_age = 30.days

  # Lifetime, in seconds, of the per-day Redis set of seen user agents.
  # Computed once at load time so the per-request path in `increment!`
  # does not build a Duration on every call.
  USER_AGENT_LIST_TTL = 3.days.to_i

  # Records one hit for +user_agent+.
  #
  # The user agent is added to today's (UTC) Redis set of seen agents, the
  # set's expiry is refreshed, and the counter increment is delegated to
  # `perform_increment!`.
  #
  # @param user_agent [String] raw user agent string of the crawler
  # @param opts [Hash, nil] passed through untouched to `perform_increment!`
  # @return whatever `perform_increment!` returns
  def self.increment!(user_agent, opts = nil)
    ua_list_key = user_agent_list_key
    $redis.sadd(ua_list_key, user_agent)
    $redis.expire(ua_list_key, USER_AGENT_LIST_TTL)

    perform_increment!(redis_key(user_agent), opts)
  end

  # Flushes the Redis counters into the database.
  #
  # Called without a date, it flushes both today and yesterday (UTC), so
  # counters written just before a date rollover are still picked up.
  #
  # For a given date, user agents are popped off that day's Redis set one at
  # a time; each agent's counter is read and reset, and a non-zero value is
  # added to the `count` column of the matching row.
  #
  # @param date [Time, Date, nil] day to flush; nil means today and yesterday
  # @return [nil]
  # @raise [Redis::CommandError] unless the error message contains READONLY
  def self.write_cache!(date = nil)
    if date.nil?
      write_cache!(Time.now.utc)
      write_cache!(Time.now.utc.yesterday)
      return
    end

    self.last_flush = Time.now.utc

    date = date.to_date
    ua_list_key = user_agent_list_key(date)

    # spop returns nil once the set is empty, which ends the loop.
    while (user_agent = $redis.spop(ua_list_key))
      val = get_and_reset(redis_key(user_agent, date))
      next if val == 0

      where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
    end
  rescue Redis::CommandError => e
    # A READONLY error from Redis is deliberately swallowed here;
    # any other command error is re-raised.
    raise unless e.message =~ /READONLY/
    nil
  end

  # Discards the pending Redis counters without writing them to the database.
  #
  # Called without a date, it clears both today and yesterday (UTC).
  #
  # @param date [Time, Date, nil] day to clear; nil means today and yesterday
  def self.clear_cache!(date = nil)
    if date.nil?
      clear_cache!(Time.now.utc)
      clear_cache!(Time.now.utc.yesterday)
      return
    end

    ua_list_key = user_agent_list_key(date)

    # Drop every per-agent counter listed in the set, then the set itself.
    while (user_agent = $redis.spop(ua_list_key))
      $redis.del redis_key(user_agent, date)
    end

    $redis.del(ua_list_key)
  end

  # NOTE: a bare `protected` only affects instance methods defined after it.
  # The `def self.` helpers below are therefore still publicly callable; the
  # keyword is kept as a marker of intent and visibility is left unchanged.
  protected

  # Redis key of the set holding every user agent seen on the given day.
  #
  # @param time [Time, Date] day the set belongs to (defaults to now, UTC)
  # @return [String] e.g. "crawl_ua_list:20180315"
  def self.user_agent_list_key(time = Time.now.utc)
    "crawl_ua_list:#{time.strftime('%Y%m%d')}"
  end

  # Redis key of the hit counter for one user agent on the given day.
  #
  # @param user_agent [String] raw user agent string
  # @param time [Time, Date] day the counter belongs to (defaults to now, UTC)
  # @return [String] e.g. "crawl_req:20180315:<user agent>"
  def self.redis_key(user_agent, time = Time.now.utc)
    "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
  end

  # Looks up the id of the row for +date+ and +user_agent+ via `request_id`.
  # NOTE(review): `request_id` presumably creates the row when it is missing
  # -- confirm in CachedCounting.
  def self.req_id(date, user_agent)
    request_id(date: date, user_agent: user_agent)
  end
end
|
2018-03-28 14:40:26 -04:00
|
|
|
|
|
|
|
# == Schema Information
#
# Table name: web_crawler_requests
#
#  id         :bigint           not null, primary key
#  date       :date             not null
#  user_agent :string           not null
#  count      :integer          default(0), not null
#
# Indexes
#
#  index_web_crawler_requests_on_date_and_user_agent  (date,user_agent) UNIQUE
#
|