From cfd507822f9330967f3ed9f970505e7f4896b523 Mon Sep 17 00:00:00 2001
From: Guo Xiang Tan
Date: Mon, 1 Apr 2019 10:14:29 +0800
Subject: [PATCH] PERF: Improve quality of `PostSearchData#raw_data`. (#7275)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This commit fixes the follow quality issue with `PostSearchData#raw_data`:
1. URLs are being tokenized and links with similar href and characters
are being duplicated in the raw data.
`Post#cooked`:
```
https://meta.discourse.org/some.png
```
`PostSearchData#raw_data` Before:
```
This is a test topic 0 Uncategorized https://meta.discourse.org/some.png discourse org/some png https://meta.discourse.org/some.png discourse org/some png
```
`PostSearchData#raw_data` After:
```
This is a test topic 0 Uncategorized https://meta.discourse.org/some.png meta discourse org
```
2. Ligthbox being included in search pollutes the
`PostSearchData#raw_data` unncessarily.
From 28 March 2018 to 28 March 2019, searches for the term `image` on
`meta.discourse.org` had a click through rate of 2.1%. Non-lightboxed images are not included in indexing for search yet we were indexing content within a lightbox. Also, search for terms like `image` was affected we were using `Pasted image` as the filename for
uploads that were pasted.
`Post#cooked`
```
Let me see how I can fix this image
\n
```
`PostSearchData#raw_data` Before:
```
This is a test topic 0 Uncategorized Let me see how I can fix this image some.png png https://meta.discourse.org/some.png discourse org/some png some.png png 1750×2000
```
`PostSearchData#raw_data` After:
```
This is a test topic 0 Uncategorized Let me see how I can fix this image
```
In terms of indexing performance, we now have to parse the given HTML
through nokogiri twice. However performance is not a huge worry here since a string length of 194170 takes only 30ms
to scrub plus the indexing takes place in a background job.
---
app/jobs/regular/process_post.rb | 2 +-
app/models/post.rb | 6 ++++-
app/services/search_indexer.rb | 38 +++++++++++++++++++++-------
lib/cooked_post_processor.rb | 3 ++-
spec/services/search_indexer_spec.rb | 38 +++++++++++++++++++++++++++-
5 files changed, 74 insertions(+), 13 deletions(-)
diff --git a/app/jobs/regular/process_post.rb b/app/jobs/regular/process_post.rb
index d6b3afa4d27..cd99dec5d63 100644
--- a/app/jobs/regular/process_post.rb
+++ b/app/jobs/regular/process_post.rb
@@ -32,7 +32,7 @@ module Jobs
# TODO suicide if needed, let's gather a few here first
Rails.logger.warn("Cooked post processor in FATAL state, bypassing. You need to urgently restart sidekiq\norig: #{orig_cooked}\nrecooked: #{recooked}\ncooked: #{cooked}\npost id: #{post.id}")
else
- post.update_column(:cooked, cp.html)
+ post.update!(cooked: cp.html)
extract_links(post)
post.publish_change_to_clients! :revised
end
diff --git a/app/models/post.rb b/app/models/post.rb
index c7382046567..b835fea5a39 100644
--- a/app/models/post.rb
+++ b/app/models/post.rb
@@ -603,7 +603,11 @@ class Post < ActiveRecord::Base
new_cooked = cook(raw, topic_id: topic_id, invalidate_oneboxes: invalidate_oneboxes)
old_cooked = cooked
- update_columns(cooked: new_cooked, baked_at: Time.new, baked_version: BAKED_VERSION)
+ self.update!(
+ cooked: new_cooked,
+ baked_at: Time.zone.now,
+ baked_version: BAKED_VERSION
+ )
if invalidate_broken_images
custom_fields.delete(BROKEN_IMAGES)
diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb
index c7b07c504d5..f731e52e87e 100644
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@@ -19,11 +19,16 @@ class SearchIndexer
# insert some extra words for I.am.a.word so "word" is tokenized
# I.am.a.word becomes I.am.a.word am a word
raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
- split = with_dot.split(".")
- if split.length > 1
- with_dot + ((+" ") << split[1..-1].join(" "))
+ if with_dot.match?(PlainTextToMarkdown::URL_REGEX)
+ "#{with_dot} #{URI.parse(with_dot).hostname.gsub('.', ' ')}"
else
- with_dot
+ split = with_dot.split(".")
+
+ if split.length > 1
+ with_dot + ((+" ") << split[1..-1].join(" "))
+ else
+ with_dot
+ end
end
end
end
@@ -183,19 +188,34 @@ class SearchIndexer
def self.scrub(html, strip_diacritics: false)
return +"" if html.blank?
+ document = Nokogiri::HTML("#{html}
", nil, Encoding::UTF_8.to_s)
+
+ document.css(
+ "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}"
+ ).remove
+
+ document.css("a[href]").each do |node|
+ node.remove_attribute("href") if node["href"] == node.text
+ end
+
me = new(strip_diacritics: strip_diacritics)
- Nokogiri::HTML::SAX::Parser.new(me).parse("#{html}
")
+ Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html)
me.scrubbed.squish
end
ATTRIBUTES ||= %w{alt title href data-youtube-title}
- def start_element(_, attributes = [])
+ def start_element(_name, attributes = [])
attributes = Hash[*attributes.flatten]
- ATTRIBUTES.each do |name|
- if attributes[name].present?
- characters(attributes[name]) unless name == "href" && UrlHelper.is_local(attributes[name])
+ ATTRIBUTES.each do |attribute_name|
+ if attributes[attribute_name].present? &&
+ !(
+ attribute_name == "href" &&
+ UrlHelper.is_local(attributes[attribute_name])
+ )
+
+ characters(attributes[attribute_name])
end
end
end
diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb
index 364f1fcd160..68a3055011f 100644
--- a/lib/cooked_post_processor.rb
+++ b/lib/cooked_post_processor.rb
@@ -8,6 +8,7 @@ require_dependency 'quote_comparer'
class CookedPostProcessor
INLINE_ONEBOX_LOADING_CSS_CLASS = "inline-onebox-loading"
INLINE_ONEBOX_CSS_CLASS = "inline-onebox"
+ LIGHTBOX_WRAPPER_CSS_CLASS = "lightbox-wrapper"
LOADING_SIZE = 10
LOADING_COLORS = 32
@@ -367,7 +368,7 @@ class CookedPostProcessor
def add_lightbox!(img, original_width, original_height, upload, cropped: false)
# first, create a div to hold our lightbox
- lightbox = create_node("div", "lightbox-wrapper")
+ lightbox = create_node("div", LIGHTBOX_WRAPPER_CSS_CLASS)
img.add_next_sibling(lightbox)
lightbox.add_child(img)
diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb
index d748ec638a9..303458610b1 100644
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@@ -61,7 +61,7 @@ describe SearchIndexer do
scrubbed = scrub(html)
- expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB")
+ expect(scrubbed).to eq("Discourse 51%20PM")
end
it 'correctly indexes a post according to version' do
@@ -110,5 +110,41 @@ describe SearchIndexer do
post.save!(validate: false)
end.to_not change { PostSearchData.count }
end
+
+ it "should not tokenize urls and duplicate title and href in " do
+ post = Fabricate(:post, raw: <<~RAW)
+ https://meta.discourse.org/some.png
+ RAW
+
+ post.rebake!
+ post.reload
+ topic = post.topic
+
+ expect(post.post_search_data.raw_data).to eq(
+ "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org"
+ )
+ end
+
+ it 'should not include lightbox in search' do
+ Jobs.run_immediately!
+ SiteSetting.max_image_height = 2000
+ SiteSetting.crawl_images = true
+ FastImage.expects(:size).returns([1750, 2000])
+
+ src = "https://meta.discourse.org/some.png"
+
+ post = Fabricate(:post, raw: <<~RAW)
+ Let me see how I can fix this image
+
+ RAW
+
+ post.rebake!
+ post.reload
+ topic = post.topic
+
+ expect(post.post_search_data.raw_data).to eq(
+ "#{topic.title} #{topic.category.name} Let me see how I can fix this image"
+ )
+ end
end
end