Merge pull request #939 from novemberkilo/master

Reduce complexity of Post - introduce a PostAnalyzer class
This commit is contained in:
Robin Ward 2013-06-01 10:18:26 -07:00
commit b4544eb027
4 changed files with 280 additions and 30 deletions

1
.gitignore vendored
View File

@ -35,6 +35,7 @@ config/discourse.pill
# Ignore all logfiles and tempfiles. # Ignore all logfiles and tempfiles.
/log/*.log /log/*.log
/tmp /tmp
/logfile
# Ignore Eclipse .project file # Ignore Eclipse .project file
/.project /.project

View File

@ -4,7 +4,7 @@ require_dependency 'rate_limiter'
require_dependency 'post_revisor' require_dependency 'post_revisor'
require_dependency 'enum' require_dependency 'enum'
require_dependency 'trashable' require_dependency 'trashable'
require_dependency 'post_analyser' require_dependency 'post_analyzer'
require 'archetype' require 'archetype'
require 'digest/sha1' require 'digest/sha1'
@ -12,7 +12,6 @@ require 'digest/sha1'
class Post < ActiveRecord::Base class Post < ActiveRecord::Base
include RateLimiter::OnCreateRecord include RateLimiter::OnCreateRecord
include Trashable include Trashable
include PostAnalyser
versioned if: :raw_changed? versioned if: :raw_changed?
@ -90,11 +89,6 @@ class Post < ActiveRecord::Base
Digest::SHA1.hexdigest(raw.gsub(/\s+/, "").downcase) Digest::SHA1.hexdigest(raw.gsub(/\s+/, "").downcase)
end end
def cooked_document
self.cooked ||= cook(raw, topic_id: topic_id)
@cooked_document ||= Nokogiri::HTML.fragment(cooked)
end
def reset_cooked def reset_cooked
@cooked_document = nil @cooked_document = nil
self.cooked = nil self.cooked = nil
@ -104,16 +98,18 @@ class Post < ActiveRecord::Base
@white_listed_image_classes ||= ['avatar', 'favicon', 'thumbnail'] @white_listed_image_classes ||= ['avatar', 'favicon', 'thumbnail']
end end
# How many images are present in the post def post_analyzer
def image_count @post_analyzer = PostAnalyzer.new(raw, topic_id)
return 0 unless raw.present? end
cooked_document.search("img").reject do |t| %w{raw_mentions linked_hosts image_count link_count raw_links}.each do |attr|
dom_class = t["class"] define_method(attr) do
if dom_class PostAnalyzer.new(raw, topic_id).send(attr)
(Post.white_listed_image_classes & dom_class.split(" ")).count > 0 end
end end
end.count
def cook(*args)
PostAnalyzer.new(raw, topic_id).cook(*args)
end end
@ -236,20 +232,6 @@ class Post < ActiveRecord::Base
Post.excerpt(cooked, maxlength, options) Post.excerpt(cooked, maxlength, options)
end end
# What we use to cook posts
def cook(*args)
cooked = PrettyText.cook(*args)
# If we have any of the oneboxes in the cache, throw them in right away, don't
# wait for the post processor.
dirty = false
result = Oneboxer.apply(cooked) do |url, elem|
Oneboxer.render_from_cache(url)
end
cooked = result.to_html if result.changed?
cooked
end
# A list of versions including the initial version # A list of versions including the initial version
def all_versions def all_versions

101
app/models/post_analyzer.rb Normal file
View File

@ -0,0 +1,101 @@
# Derives link, image and mention information from the raw text of a post.
#
# An analyzer is built from the raw text and a topic id. The raw text is
# cooked to HTML on demand (see #cook) and the query methods below inspect
# either the raw text or the cooked document.
class PostAnalyzer

  attr_accessor :cooked, :raw

  # raw      - the raw (uncooked) text of the post
  # topic_id - handed to the cooker as the :topic_id option
  def initialize(raw, topic_id)
    @raw = raw
    @topic_id = topic_id
  end

  # Returns the cooked post parsed as a Nokogiri HTML fragment.
  #
  # The result is memoized so that the query methods below share a single
  # cook + parse per analyzer instead of repeating both on every call.
  # As a side effect the cooked HTML is stored in @cooked.
  def cooked_document
    @cooked_document ||= begin
      @cooked = cook(@raw, topic_id: @topic_id)
      Nokogiri::HTML.fragment(@cooked)
    end
  end

  # What we use to cook posts
  #
  # All arguments are forwarded untouched to PrettyText.cook. Returns the
  # cooked HTML string.
  def cook(*args)
    cooked = PrettyText.cook(*args)

    # If we have any of the oneboxes in the cache, throw them in right away, don't
    # wait for the post processor.
    result = Oneboxer.apply(cooked) do |url, _elem|
      Oneboxer.render_from_cache(url)
    end

    result.changed? ? result.to_html : cooked
  end

  # How many images are present in the post
  #
  # Images carrying any of the classes in Post.white_listed_image_classes
  # are not counted. Returns 0 when the raw text is blank.
  def image_count
    return 0 unless @raw.present?

    cooked_document.search("img").reject do |img|
      dom_class = img["class"]
      dom_class && (Post.white_listed_image_classes & dom_class.split(" ")).any?
    end.count
  end

  # Returns the unique, lowercased usernames mentioned in the raw text,
  # without the leading "@". Returns [] when the raw text is blank.
  def raw_mentions
    return [] if @raw.blank?

    @raw_mentions ||= begin
      # We don't count mentions in quotes
      raw_stripped = @raw.gsub(/\[quote=(.*)\]([^\[]*?)\[\/quote\]/im, '')

      # Strip pre and code tags
      doc = Nokogiri::HTML.fragment(raw_stripped)
      doc.search("pre").remove
      doc.search("code").remove

      results = doc.to_html.scan(PrettyText.mention_matcher)

      # Non-bang sub: gsub! would yield nil for a match that has no leading
      # "@", which would leak nil entries into the result.
      results.uniq.map { |un| un.first.downcase.sub(/\A@/, '') }
    end
  end

  # Returns a hash with one entry per distinct host linked in the post.
  #
  # Every value is 1, however many links point at that host. Links that
  # cannot be parsed as a URI, or that carry no host (for example relative
  # links), are skipped.
  def linked_hosts
    links = raw_links
    return {} if links.blank?

    @linked_hosts ||= links.each_with_object({}) do |url, hosts|
      host = host_for(url)
      hosts[host] ||= 1 if host
    end
  end

  # Returns an array of all links in a post excluding mentions
  #
  # Each entry is the href of an anchor in the cooked document, as a string.
  # Returns [] when the raw text is blank.
  def raw_links
    return [] unless @raw.present?

    # Don't include @mentions in the link count
    @raw_links ||= cooked_document.search("a[href]").
      reject { |l| link_is_a_mention?(l) }.
      map { |l| l.attributes['href'].to_s }
  end

  # How many links are present in the post
  def link_count
    raw_links.size
  end

  private

  # True when the anchor has exactly the class "mention" and its href
  # points under /users/.
  def link_is_a_mention?(l)
    html_class = l.attributes['class']
    return false if html_class.nil?
    html_class.to_s == 'mention' && l.attributes['href'].to_s =~ /^\/users\//
  end

  # Host part of url, or nil when url is not a parseable URI or has no host.
  # Post content is user supplied, so a malformed href must not abort the
  # analysis of the remaining links.
  def host_for(url)
    URI.parse(url).host
  rescue URI::InvalidURIError
    nil
  end

end

View File

@ -0,0 +1,166 @@
require 'spec_helper'
# Specs for PostAnalyzer: link extraction, linked hosts, image counting,
# link counting and mention extraction.
describe PostAnalyzer do

  let(:topic) { Fabricate(:topic) }
  let(:default_topic_id) { topic.id }
  let(:post_args) { { user: topic.user, topic: topic } }

  # Builds the analyzer under test. The topic is only fabricated when no
  # explicit topic id is given, because the default is evaluated lazily.
  def analyzer_for(raw, topic_id = default_topic_id)
    PostAnalyzer.new(raw, topic_id)
  end

  context "links" do
    let(:raw_no_links) { "hello world my name is evil trout" }
    let(:raw_one_link_md) { "[jlawr](http://www.imdb.com/name/nm2225369)" }
    let(:raw_two_links_html) { "<a href='http://disneyland.disney.go.com/'>disney</a> <a href='http://reddit.com'>reddit</a>"}
    let(:raw_three_links) { "http://discourse.org and http://discourse.org/another_url and http://www.imdb.com/name/nm2225369"}

    describe "raw_links" do
      it "returns a blank collection for a post with no links" do
        analyzer_for(raw_no_links).raw_links.should be_blank
      end

      it "finds a link within markdown" do
        analyzer_for(raw_one_link_md).raw_links.should == ["http://www.imdb.com/name/nm2225369"]
      end

      it "can find two links from html" do
        analyzer_for(raw_two_links_html).raw_links.should == ["http://disneyland.disney.go.com/", "http://reddit.com"]
      end

      it "can find three links without markup" do
        analyzer_for(raw_three_links).raw_links.should == ["http://discourse.org", "http://discourse.org/another_url", "http://www.imdb.com/name/nm2225369"]
      end
    end

    describe "linked_hosts" do
      it "returns blank with no links" do
        analyzer_for(raw_no_links).linked_hosts.should be_blank
      end

      it "returns the host and a count for links" do
        analyzer_for(raw_two_links_html).linked_hosts.should == {"disneyland.disney.go.com" => 1, "reddit.com" => 1}
      end

      it "it counts properly with more than one link on the same host" do
        analyzer_for(raw_three_links).linked_hosts.should == {"discourse.org" => 1, "www.imdb.com" => 1}
      end
    end
  end

  describe "image_count" do
    let(:raw_post_one_image_md) { "![sherlock](http://bbc.co.uk/sherlock.jpg)" }
    let(:raw_post_two_images_html) { "<img src='http://discourse.org/logo.png'> <img src='http://bbc.co.uk/sherlock.jpg'>" }
    let(:raw_post_with_avatars) { '<img alt="smiley" title=":smiley:" src="/assets/emoji/smiley.png" class="avatar"> <img alt="wink" title=":wink:" src="/assets/emoji/wink.png" class="avatar">' }
    let(:raw_post_with_favicon) { '<img src="/assets/favicons/wikipedia.png" class="favicon">' }
    let(:raw_post_with_thumbnail) { '<img src="/assets/emoji/smiley.png" class="thumbnail">' }
    let(:raw_post_with_two_classy_images) { "<img src='http://discourse.org/logo.png' class='classy'> <img src='http://bbc.co.uk/sherlock.jpg' class='classy'>" }

    it "returns 0 images for an empty post" do
      analyzer_for("Hello world", nil).image_count.should == 0
    end

    it "finds images from markdown" do
      analyzer_for(raw_post_one_image_md).image_count.should == 1
    end

    it "finds images from HTML" do
      analyzer_for(raw_post_two_images_html).image_count.should == 2
    end

    it "doesn't count avatars as images" do
      analyzer_for(raw_post_with_avatars).image_count.should == 0
    end

    it "doesn't count favicons as images" do
      analyzer_for(raw_post_with_favicon).image_count.should == 0
    end

    it "doesn't count thumbnails as images" do
      analyzer_for(raw_post_with_thumbnail).image_count.should == 0
    end

    it "doesn't count whitelisted images" do
      # Swap the whitelist before the analyzer is built and queried.
      Post.stubs(:white_listed_image_classes).returns(["classy"])
      analyzer_for(raw_post_with_two_classy_images).image_count.should == 0
    end
  end

  describe "link_count" do
    let(:raw_post_one_link_md) { "[sherlock](http://www.bbc.co.uk/programmes/b018ttws)" }
    let(:raw_post_two_links_html) { "<a href='http://discourse.org'>discourse</a> <a href='http://twitter.com'>twitter</a>" }
    let(:raw_post_with_mentions) { "hello @novemberkilo how are you doing?" }

    it "returns 0 links for an empty post" do
      analyzer_for("Hello world", nil).link_count.should == 0
    end

    it "returns 0 links for a post with mentions" do
      analyzer_for(raw_post_with_mentions).link_count.should == 0
    end

    it "finds links from markdown" do
      analyzer_for(raw_post_one_link_md).link_count.should == 1
    end

    it "finds links from HTML" do
      analyzer_for(raw_post_two_links_html).link_count.should == 2
    end
  end

  describe "raw_mentions" do
    it "returns an empty array with no matches" do
      analyzer_for("Hello Jake and Finn!").raw_mentions.should == []
    end

    it "returns lowercase unique versions of the mentions" do
      analyzer_for("@Jake @Finn @Jake").raw_mentions.should == ['jake', 'finn']
    end

    it "ignores pre" do
      analyzer_for("<pre>@Jake</pre> @Finn").raw_mentions.should == ['finn']
    end

    it "catches content between pre tags" do
      analyzer_for("<pre>hello</pre> @Finn <pre></pre>").raw_mentions.should == ['finn']
    end

    it "ignores code" do
      analyzer_for("@Jake <code>@Finn</code>").raw_mentions.should == ['jake']
    end

    it "ignores quotes" do
      analyzer_for("[quote=\"Evil Trout\"]@Jake[/quote] @Finn").raw_mentions.should == ['finn']
    end

    it "handles underscore in username" do
      analyzer_for("@Jake @Finn @Jake_Old").raw_mentions.should == ['jake', 'finn', 'jake_old']
    end
  end
end