From cfd507822f9330967f3ed9f970505e7f4896b523 Mon Sep 17 00:00:00 2001 From: Guo Xiang Tan Date: Mon, 1 Apr 2019 10:14:29 +0800 Subject: [PATCH] PERF: Improve quality of `PostSearchData#raw_data`. (#7275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the follow quality issue with `PostSearchData#raw_data`: 1. URLs are being tokenized and links with similar href and characters are being duplicated in the raw data. `Post#cooked`: ```

https://meta.discourse.org/some.png

``` `PostSearchData#raw_data` Before: ``` This is a test topic 0 Uncategorized https://meta.discourse.org/some.png discourse org/some png https://meta.discourse.org/some.png discourse org/some png ``` `PostSearchData#raw_data` After: ``` This is a test topic 0 Uncategorized https://meta.discourse.org/some.png meta discourse org ``` 2. Ligthbox being included in search pollutes the `PostSearchData#raw_data` unncessarily. From 28 March 2018 to 28 March 2019, searches for the term `image` on `meta.discourse.org` had a click through rate of 2.1%. Non-lightboxed images are not included in indexing for search yet we were indexing content within a lightbox. Also, search for terms like `image` was affected we were using `Pasted image` as the filename for uploads that were pasted. `Post#cooked` ```

Let me see how I can fix this image
\n

\nsome.png1750×2000\n

``` `PostSearchData#raw_data` Before: ``` This is a test topic 0 Uncategorized Let me see how I can fix this image some.png png https://meta.discourse.org/some.png discourse org/some png some.png png 1750×2000 ``` `PostSearchData#raw_data` After: ``` This is a test topic 0 Uncategorized Let me see how I can fix this image ``` In terms of indexing performance, we now have to parse the given HTML through nokogiri twice. However performance is not a huge worry here since a string length of 194170 takes only 30ms to scrub plus the indexing takes place in a background job. --- app/jobs/regular/process_post.rb | 2 +- app/models/post.rb | 6 ++++- app/services/search_indexer.rb | 38 +++++++++++++++++++++------- lib/cooked_post_processor.rb | 3 ++- spec/services/search_indexer_spec.rb | 38 +++++++++++++++++++++++++++- 5 files changed, 74 insertions(+), 13 deletions(-) diff --git a/app/jobs/regular/process_post.rb b/app/jobs/regular/process_post.rb index d6b3afa4d27..cd99dec5d63 100644 --- a/app/jobs/regular/process_post.rb +++ b/app/jobs/regular/process_post.rb @@ -32,7 +32,7 @@ module Jobs # TODO suicide if needed, let's gather a few here first Rails.logger.warn("Cooked post processor in FATAL state, bypassing. You need to urgently restart sidekiq\norig: #{orig_cooked}\nrecooked: #{recooked}\ncooked: #{cooked}\npost id: #{post.id}") else - post.update_column(:cooked, cp.html) + post.update!(cooked: cp.html) extract_links(post) post.publish_change_to_clients! :revised end diff --git a/app/models/post.rb b/app/models/post.rb index c7382046567..b835fea5a39 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -603,7 +603,11 @@ class Post < ActiveRecord::Base new_cooked = cook(raw, topic_id: topic_id, invalidate_oneboxes: invalidate_oneboxes) old_cooked = cooked - update_columns(cooked: new_cooked, baked_at: Time.new, baked_version: BAKED_VERSION) + self.update!( + cooked: new_cooked, + baked_at: Time.zone.now, + baked_version: BAKED_VERSION + ) if invalidate_broken_images custom_fields.delete(BROKEN_IMAGES) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index c7b07c504d5..f731e52e87e 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -19,11 +19,16 @@ class SearchIndexer # insert some extra words for I.am.a.word so "word" is tokenized # I.am.a.word becomes I.am.a.word am a word raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot| - split = with_dot.split(".") - if split.length > 1 - with_dot + ((+" ") << split[1..-1].join(" ")) + if with_dot.match?(PlainTextToMarkdown::URL_REGEX) + "#{with_dot} #{URI.parse(with_dot).hostname.gsub('.', ' ')}" else - with_dot + split = with_dot.split(".") + + if split.length > 1 + with_dot + ((+" ") << split[1..-1].join(" ")) + else + with_dot + end end end end @@ -183,19 +188,34 @@ class SearchIndexer def self.scrub(html, strip_diacritics: false) return +"" if html.blank? + document = Nokogiri::HTML("
#{html}
", nil, Encoding::UTF_8.to_s) + + document.css( + "div.#{CookedPostProcessor::LIGHTBOX_WRAPPER_CSS_CLASS}" + ).remove + + document.css("a[href]").each do |node| + node.remove_attribute("href") if node["href"] == node.text + end + me = new(strip_diacritics: strip_diacritics) - Nokogiri::HTML::SAX::Parser.new(me).parse("
#{html}
") + Nokogiri::HTML::SAX::Parser.new(me).parse(document.to_html) me.scrubbed.squish end ATTRIBUTES ||= %w{alt title href data-youtube-title} - def start_element(_, attributes = []) + def start_element(_name, attributes = []) attributes = Hash[*attributes.flatten] - ATTRIBUTES.each do |name| - if attributes[name].present? - characters(attributes[name]) unless name == "href" && UrlHelper.is_local(attributes[name]) + ATTRIBUTES.each do |attribute_name| + if attributes[attribute_name].present? && + !( + attribute_name == "href" && + UrlHelper.is_local(attributes[attribute_name]) + ) + + characters(attributes[attribute_name]) end end end diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb index 364f1fcd160..68a3055011f 100644 --- a/lib/cooked_post_processor.rb +++ b/lib/cooked_post_processor.rb @@ -8,6 +8,7 @@ require_dependency 'quote_comparer' class CookedPostProcessor INLINE_ONEBOX_LOADING_CSS_CLASS = "inline-onebox-loading" INLINE_ONEBOX_CSS_CLASS = "inline-onebox" + LIGHTBOX_WRAPPER_CSS_CLASS = "lightbox-wrapper" LOADING_SIZE = 10 LOADING_COLORS = 32 @@ -367,7 +368,7 @@ class CookedPostProcessor def add_lightbox!(img, original_width, original_height, upload, cropped: false) # first, create a div to hold our lightbox - lightbox = create_node("div", "lightbox-wrapper") + lightbox = create_node("div", LIGHTBOX_WRAPPER_CSS_CLASS) img.add_next_sibling(lightbox) lightbox.add_child(img) diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index d748ec638a9..303458610b1 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -61,7 +61,7 @@ describe SearchIndexer do scrubbed = scrub(html) - expect(scrubbed).to eq("Discourse 51%20PM Untitled design (21).jpg Untitled%20design%20(21) Untitled design (21).jpg 1280x1136 472 KB") + expect(scrubbed).to eq("Discourse 51%20PM") end it 'correctly indexes a post according to version' do @@ -110,5 +110,41 @@ describe SearchIndexer do post.save!(validate: false) end.to_not change { PostSearchData.count } end + + it "should not tokenize urls and duplicate title and href in " do + post = Fabricate(:post, raw: <<~RAW) + https://meta.discourse.org/some.png + RAW + + post.rebake! + post.reload + topic = post.topic + + expect(post.post_search_data.raw_data).to eq( + "#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png meta discourse org" + ) + end + + it 'should not include lightbox in search' do + Jobs.run_immediately! + SiteSetting.max_image_height = 2000 + SiteSetting.crawl_images = true + FastImage.expects(:size).returns([1750, 2000]) + + src = "https://meta.discourse.org/some.png" + + post = Fabricate(:post, raw: <<~RAW) + Let me see how I can fix this image + + RAW + + post.rebake! + post.reload + topic = post.topic + + expect(post.post_search_data.raw_data).to eq( + "#{topic.title} #{topic.category.name} Let me see how I can fix this image" + ) + end end end