PERF: Improve quality of `PostSearchData#raw_data`. (#7275)
This commit fixes the follow quality issue with `PostSearchData#raw_data`: 1. URLs are being tokenized and links with similar href and characters are being duplicated in the raw data. `Post#cooked`: ``` <p><a href=\"https://meta.discourse.org/some.png\" class=\"onebox\" target=\"_blank\" rel=\"nofollow noopener\">https://meta.discourse.org/some.png</a></p> ``` `PostSearchData#raw_data` Before: ``` This is a test topic 0 Uncategorized https://meta.discourse.org/some.png discourse org/some png https://meta.discourse.org/some.png discourse org/some png ``` `PostSearchData#raw_data` After: ``` This is a test topic 0 Uncategorized https://meta.discourse.org/some.png meta discourse org ``` 2. Ligthbox being included in search pollutes the `PostSearchData#raw_data` unncessarily. From 28 March 2018 to 28 March 2019, searches for the term `image` on `meta.discourse.org` had a click through rate of 2.1%. Non-lightboxed images are not included in indexing for search yet we were indexing content within a lightbox. Also, search for terms like `image` was affected we were using `Pasted image` as the filename for uploads that were pasted. `Post#cooked` ``` <p>Let me see how I can fix this image<br>\n<div class=\"lightbox-wrapper\"><a class=\"lightbox\" href=\"https://meta.discourse.org/some.png\" title=\"some.png\" rel=\"nofollow noopener\"><img src=\"https://meta.discourse.org/some.png\" width=\"275\" height=\"299\"><div class=\"meta\">\n<svg class=\"fa d-icon d-icon-far-image svg-icon\" aria-hidden=\"true\"><use xlink:href=\"#far-image\"></use></svg><span class=\"filename\">some.png</span><span class=\"informations\">1750×2000</span><svg class=\"fa d-icon d-icon-discourse-expand svg-icon\" aria-hidden=\"true\"><use xlink:href=\"#discourse-expand\"></use></svg>\n</div></a></div></p> ``` `PostSearchData#raw_data` Before: ``` This is a test topic 0 Uncategorized Let me see how I can fix this image some.png png https://meta.discourse.org/some.png discourse org/some png some.png png 1750×2000 ``` `PostSearchData#raw_data` After: ``` This is a test topic 0 Uncategorized Let me see how I can fix this image ``` In terms of indexing performance, we now have to parse the given HTML through nokogiri twice. However performance is not a huge worry here since a string length of 194170 takes only 30ms to scrub plus the indexing takes place in a background job.
This commit is contained in:
parent
7ac76fe935
commit
cfd507822f
|
@ -32,7 +32,7 @@ module Jobs
|
|||
# TODO suicide if needed, let's gather a few here first
|
||||
Rails.logger.warn("Cooked post processor in FATAL state, bypassing. You need to urgently restart sidekiq\norig: #{orig_cooked}\nrecooked: #{recooked}\ncooked: #{cooked}\npost id: #{post.id}")
|
||||
else
|
||||
post.update_column(:cooked, cp.html)
|
||||
post.update!(cooked: cp.html)
|
||||
extract_links(post)
|
||||
post.publish_change_to_clients! :revised
|
||||
end
|
||||
|
|
|
@ -603,7 +603,11 @@ class Post < ActiveRecord::Base
|
|||
new_cooked = cook(raw, topic_id: topic_id, invalidate_oneboxes: invalidate_oneboxes)
|
||||
old_cooked = cooked
|
||||
|
||||
update_columns(cooked: new_cooked, baked_at: Time.new, baked_version: BAKED_VERSION)
|
||||
self.update!(
|
||||
cooked: new_cooked,
|
||||
baked_at: Time.zone.now,
|
||||
baked_version: BAKED_VERSION
|
||||
)
|
||||
|
||||
if invalidate_broken_images
|
||||
custom_fields.delete(BROKEN_IMAGES)
|
||||
|
|
|
@ -19,11 +19,16 @@ class SearchIndexer
|
|||
# insert some extra words for I.am.a.word so "word" is tokenized
|
||||
# I.am.a.word becomes I.am.a.word am a word
|
||||
raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
|
||||
split = with_dot.split(".")
|
||||
if split.length > 1
|
||||
with_dot + ((+" ") << split[1..-1].join(" "))
|
||||
if with_dot.match?(PlainTextToMarkdown::URL_REGEX)
|
||||
"#{with_dot} #{URI.parse(with_dot).hostname.gsub('.', ' ')}"
|
||||
else
|
||||
with_dot
|
||||
split = with_dot.split(".")
|
||||
|
||||
if split.length > 1
|
||||
with_dot + ((+" ") << split[1..-1].join(" "))
|
||||
else
|
||||
with_dot
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -183,19 +188,34 @@ class SearchIndexer
|
|||
def self.scrub(html, strip_diacritics: false)
|
||||
return +"" if html.blank?
|
||||
|
||||
document = Nokogiri::HTML("<div>#{html}</div>", nil, Encoding::UTF_8.to_s)
|
||||
|
||||
document.css(
|
||||
"div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
|
||||
).remove
|
||||
|
||||
document.css("a[href]").each do |node|
|
||||
node.remove_attribute("href") if node["href"] == node.text
|
||||
end
|
||||
|
||||
me = new(strip_diacritics: strip_diacritics)
|
||||
Nokogiri::HTML::SAX::Parser.new(me).parse("<div>#{html}</div>")
|
||||
Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
|
||||
me.scrubbed.squish
|
||||
end
|
||||
|
||||
ATTRIBUTES ||= %w{alt title href data-youtube-title}
|
||||
|
||||
def start_element(_, attributes = [])
|
||||
def start_element(_name, attributes = [])
|
||||
attributes = Hash[*attributes.flatten]
|
||||
|
||||
ATTRIBUTES.each do |name|
|
||||
if attributes[name].present?
|
||||
characters(attributes[name]) unless name == "href" && UrlHelper.is_local(attributes[name])
|
||||
ATTRIBUTES.each do |attribute_name|
|
||||
if attributes[attribute_name].present? &&
|
||||
!(
|
||||
attribute_name == "href" &&
|
||||
UrlHelper.is_local(attributes[attribute_name])
|
||||
)
|
||||
|
||||
characters(attributes[attribute_name])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -8,6 +8,7 @@ require_dependency 'quote_comparer'
|
|||
class CookedPostProcessor
|
||||
INLINE_ONEBOX_LOADING_CSS_CLASS = "inline-onebox-loading"
|
||||
INLINE_ONEBOX_CSS_CLASS = "inline-onebox"
|
||||
LIGHTBOX_WRAPPER_CSS_CLASS = "lightbox-wrapper"
|
||||
LOADING_SIZE = 10
|
||||
LOADING_COLORS = 32
|
||||
|
||||
|
@ -367,7 +368,7 @@ class CookedPostProcessor
|
|||
|
||||
def add_lightbox!(img, original_width, original_height, upload, cropped: false)
|
||||
# first, create a div to hold our lightbox
|
||||
lightbox = create_node("div", "lightbox-wrapper")
|
||||
lightbox = create_node("div", LIGHTBOX_WRAPPER_CSS_CLASS)
|
||||
img.add_next_sibling(lightbox)
|
||||
lightbox.add_child(img)
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ describe SearchIndexer do
|
|||
|
||||
scrubbed = scrub(html)
|
||||
|
||||
expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")
|
||||
expect(scrubbed).to eq("Discourse 51%20PM")
|
||||
end
|
||||
|
||||
it 'correctly indexes a post according to version' do
|
||||
|
@ -110,5 +110,41 @@ describe SearchIndexer do
|
|||
post.save!(validate: false)
|
||||
end.to_not change { PostSearchData.count }
|
||||
end
|
||||
|
||||
it "should not tokenize urls and duplicate title and href in <a>" do
|
||||
post = Fabricate(:post, raw: <<~RAW)
|
||||
https://meta.discourse.org/some.png
|
||||
RAW
|
||||
|
||||
post.rebake!
|
||||
post.reload
|
||||
topic = post.topic
|
||||
|
||||
expect(post.post_search_data.raw_data).to eq(
|
||||
"#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org"
|
||||
)
|
||||
end
|
||||
|
||||
it 'should not include lightbox in search' do
|
||||
Jobs.run_immediately!
|
||||
SiteSetting.max_image_height = 2000
|
||||
SiteSetting.crawl_images = true
|
||||
FastImage.expects(:size).returns([1750, 2000])
|
||||
|
||||
src = "https://meta.discourse.org/some.png"
|
||||
|
||||
post = Fabricate(:post, raw: <<~RAW)
|
||||
Let me see how I can fix this image
|
||||
<img src="#{src}" width="275" height="299">
|
||||
RAW
|
||||
|
||||
post.rebake!
|
||||
post.reload
|
||||
topic = post.topic
|
||||
|
||||
expect(post.post_search_data.raw_data).to eq(
|
||||
"#{topic.title} #{topic.category.name} Let me see how I can fix this image"
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue