FEATURE: search within title using in:title

Also

- Significantly improved search ranking, title is treated most strongly
- Adds tag names to the index
- Run search re-indexer more aggressively
- Re-index topic and all posts on category change
This commit is contained in:
Sam 2018-02-20 14:41:00 +11:00
parent 73a492f721
commit 86d12bd44b
8 changed files with 187 additions and 110 deletions

View File

@ -1,7 +1,7 @@
module Jobs
# if locale changes or search algorithm changes we may want to reindex stuff
class ReindexSearch < Jobs::Scheduled
every 1.day
every 2.hours
def execute(args)
rebuild_problem_topics
@ -38,13 +38,14 @@ module Jobs
end
end
def rebuild_problem_posts(limit = 10000)
def rebuild_problem_posts(limit = 20000)
post_ids = load_problem_post_ids(limit)
post_ids.each do |id|
post = Post.find_by(id: id)
# could be deleted while iterating through batch
SearchIndexer.index(post, force: true) if post
if post = Post.find_by(id: id)
SearchIndexer.index(post, force: true)
end
end
end
@ -67,6 +68,7 @@ module Jobs
WHERE pd.post_id IS NULL
)', SiteSetting.default_locale, Search::INDEX_VERSION)
.limit(limit)
.order('posts.id DESC')
.pluck(:id)
end

View File

@ -223,11 +223,16 @@ class Topic < ActiveRecord::Base
ApplicationController.banner_json_cache.clear
end
if tags_changed || saved_change_to_attribute?(:category_id)
SearchIndexer.queue_post_reindex(self.id)
if tags_changed
TagUser.auto_watch(topic_id: id)
TagUser.auto_track(topic_id: id)
self.tags_changed = false
end
end
SearchIndexer.index(self)
UserActionCreator.log_topic(self)
@ -474,7 +479,7 @@ class Topic < ActiveRecord::Base
search_data = "#{title} #{raw[0...MAX_SIMILAR_BODY_LENGTH]}".strip
filter_words = Search.prepare_data(search_data)
ts_query = Search.ts_query(filter_words, nil, "|")
ts_query = Search.ts_query(term: filter_words, joiner: "|")
candidates = Topic
.visible

View File

@ -49,7 +49,7 @@ class UserSearch
if @term.present?
if SiteSetting.enable_names? && @term !~ /[_\.-]/
query = Search.ts_query(@term, "simple")
query = Search.ts_query(term: @term, ts_config: "simple")
users = users.includes(:user_search_data)
.references(:user_search_data)

View File

@ -1,3 +1,4 @@
# frozen_string_literal: true
require_dependency 'search'
class SearchIndexer
@ -14,111 +15,152 @@ class SearchIndexer
HtmlScrubber.scrub(html)
end
def self.update_index(table, id, raw_data)
raw_data = Search.prepare_data(raw_data, :index)
table_name = "#{table}_search_data"
foreign_key = "#{table}_id"
def self.inject_extra_terms(raw)
# insert some extra words for I.am.a.word so "word" is tokenized
# I.am.a.word becomes I.am.a.word am a word
search_data = raw_data.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
raw.gsub(/[^[:space:]]*[\.]+[^[:space:]]*/) do |with_dot|
split = with_dot.split(".")
if split.length > 1
with_dot + (" " << split[1..-1].join(" "))
with_dot + ((+" ") << split[1..-1].join(" "))
else
with_dot
end
end
end
def self.update_index(table: , id: , raw_data:)
search_data = raw_data.map do |data|
inject_extra_terms(Search.prepare_data(data || "", :index))
end
table_name = "#{table}_search_data"
foreign_key = "#{table}_id"
# for user login and name use "simple" lowercase stemmer
stemmer = table == "user" ? "simple" : Search.ts_config
ranked_index = <<~SQL
setweight(to_tsvector('#{stemmer}', coalesce(:a,'')), 'A') ||
setweight(to_tsvector('#{stemmer}', coalesce(:b,'')), 'B') ||
setweight(to_tsvector('#{stemmer}', coalesce(:c,'')), 'C') ||
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
SQL
indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
params = {
a: search_data[0],
b: search_data[1],
c: search_data[2],
d: search_data[3],
raw_data: indexed_data,
id: id,
locale: SiteSetting.default_locale,
version: Search::INDEX_VERSION
}
# Would be nice to use AR here but not sure how to execut Postgres functions
# when inserting data like this.
rows = Post.exec_sql_row_count("UPDATE #{table_name}
rows = Post.exec_sql_row_count(<<~SQL, params)
UPDATE #{table_name}
SET
raw_data = :raw_data,
locale = :locale,
search_data = TO_TSVECTOR('#{stemmer}', :search_data),
search_data = #{ranked_index},
version = :version
WHERE #{foreign_key} = :id",
raw_data: raw_data,
search_data: search_data,
id: id,
locale: SiteSetting.default_locale,
version: Search::INDEX_VERSION)
WHERE #{foreign_key} = :id
SQL
if rows == 0
Post.exec_sql("INSERT INTO #{table_name}
Post.exec_sql(<<~SQL, params)
INSERT INTO #{table_name}
(#{foreign_key}, search_data, locale, raw_data, version)
VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :raw_data, :version)",
raw_data: raw_data,
search_data: search_data,
id: id,
locale: SiteSetting.default_locale,
version: Search::INDEX_VERSION)
VALUES (:id, #{ranked_index}, :locale, :raw_data, :version)
SQL
end
rescue
# don't allow concurrency to mess up saving a post
# TODO is there any way we can safely avoid this?
# best way is probably pushing search indexer into a dedicated process so it no longer happens on save
# instead in the post processor
end
def self.update_topics_index(topic_id, title, cooked)
search_data = title.dup << " " << scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
update_index('topic', topic_id, search_data)
scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
# a bit inconsitent that we use title as A and body as B when in
# the post index body is C
update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
end
def self.update_posts_index(post_id, cooked, title, category)
search_data = scrub_html_for_search(cooked) << " " << title.dup.force_encoding('UTF-8')
search_data << " " << category if category
update_index('post', post_id, search_data)
def self.update_posts_index(post_id, title, category, tags, cooked)
update_index(table: 'post', id: post_id, raw_data: [title, category, tags, scrub_html_for_search(cooked)])
end
def self.update_users_index(user_id, username, name)
search_data = username.dup << " " << (name || "")
update_index('user', user_id, search_data)
update_index(table: 'user', id: user_id, raw_data: [username, name])
end
def self.update_categories_index(category_id, name)
update_index('category', category_id, name)
update_index(table: 'category', id: category_id, raw_data: [name])
end
def self.update_tags_index(tag_id, name)
update_index('tag', tag_id, name)
update_index(table: 'tag', id: tag_id, raw_data: [name])
end
def self.queue_post_reindex(topic_id)
return if @disabled
ActiveRecord::Base.exec_sql(<<~SQL, topic_id: topic_id)
UPDATE post_search_data
SET version = 0
WHERE post_id IN (SELECT id FROM posts WHERE topic_id = :topic_id)
SQL
end
def self.index(obj, force: false)
return if @disabled
if obj.class == Post && (obj.saved_change_to_cooked? || force)
if obj.topic
category_name = obj.topic.category.name if obj.topic.category
SearchIndexer.update_posts_index(obj.id, obj.cooked, obj.topic.title, category_name)
SearchIndexer.update_topics_index(obj.topic_id, obj.topic.title, obj.cooked) if obj.is_first_post?
category_name, tag_names = nil
topic = nil
if Topic === obj
topic = obj
elsif Post === obj
topic = obj.topic
end
category_name = topic.category&.name if topic
tag_names = topic.tags.pluck(:name).join(' ') if topic
if Post === obj && (obj.saved_change_to_cooked? || force)
if topic
SearchIndexer.update_posts_index(obj.id, topic.title, category_name, tag_names, obj.cooked)
SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
else
Rails.logger.warn("Orphan post skipped in search_indexer, topic_id: #{obj.topic_id} post_id: #{obj.id} raw: #{obj.raw}")
end
end
if obj.class == User && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
SearchIndexer.update_users_index(obj.id, obj.username_lower || '', obj.name ? obj.name.downcase : '')
end
if obj.class == Topic && (obj.saved_change_to_title? || force)
if Topic === obj && (obj.saved_change_to_title? || force)
if obj.posts
post = obj.posts.find_by(post_number: 1)
if post
category_name = obj.category.name if obj.category
SearchIndexer.update_posts_index(post.id, post.cooked, obj.title, category_name)
SearchIndexer.update_posts_index(post.id, obj.title, category_name, tag_names, post.cooked)
SearchIndexer.update_topics_index(obj.id, obj.title, post.cooked)
end
end
end
if obj.class == Category && (obj.saved_change_to_name? || force)
if Category === obj && (obj.saved_change_to_name? || force)
SearchIndexer.update_categories_index(obj.id, obj.name)
end
if obj.class == Tag && (obj.saved_change_to_name? || force)
if Tag === obj && (obj.saved_change_to_name? || force)
SearchIndexer.update_tags_index(obj.id, obj.name)
end
end
@ -127,14 +169,14 @@ class SearchIndexer
attr_reader :scrubbed
def initialize
@scrubbed = ""
@scrubbed = +""
end
def self.scrub(html)
me = new
parser = Nokogiri::HTML::SAX::Parser.new(me)
begin
copy = "<div>"
copy = +"<div>"
copy << html unless html.nil?
copy << "</div>"
parser.parse(html) unless html.nil?

View File

@ -1,7 +1,7 @@
require_dependency 'search/grouped_search_results'
class Search
INDEX_VERSION = 1.freeze
INDEX_VERSION = 2.freeze
def self.per_facet
5
@ -409,7 +409,7 @@ class Search
if match.to_s.length >= SiteSetting.min_search_term_length
posts.where("posts.id IN (
SELECT post_id FROM post_search_data pd1
WHERE pd1.search_data @@ #{Search.ts_query("##{match}")})")
WHERE pd1.search_data @@ #{Search.ts_query(term: "##{match}")})")
end
end
end
@ -511,12 +511,17 @@ class Search
end
end
@in_title = false
if word == 'order:latest' || word == 'l'
@order = :latest
nil
elsif word == 'order:latest_topic'
@order = :latest_topic
nil
elsif word == 'in:title'
@in_title = true
nil
elsif word =~ /topic:(\d+)/
topic_id = $1.to_i
if topic_id > 1
@ -681,7 +686,12 @@ class Search
posts = posts.joins('JOIN users u ON u.id = posts.user_id')
posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
else
posts = posts.where("post_search_data.search_data @@ #{ts_query}")
# A is for title
# B is for category
# C is for tags
# D is for cooked
weights = @in_title ? 'A' : (SiteSetting.tagging_enabled ? 'ABCD' : 'ABD')
posts = posts.where("post_search_data.search_data @@ #{ts_query(weight_filter: weights)}")
exact_terms = @term.scan(/"([^"]+)"/).flatten
exact_terms.each do |exact|
posts = posts.where("posts.raw ilike ?", "%#{exact}%")
@ -743,11 +753,9 @@ class Search
posts = posts.order("posts.like_count DESC")
end
else
posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC")
data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
if opts[:aggregate_search]
posts = posts.order("SUM(#{data_ranking}) DESC")
posts = posts.order("MAX(#{data_ranking}) DESC")
else
posts = posts.order("#{data_ranking} DESC")
end
@ -772,7 +780,7 @@ class Search
self.class.default_ts_config
end
def self.ts_query(term, ts_config = nil, joiner = "&")
def self.ts_query(term: , ts_config: nil, joiner: "&", weight_filter: nil)
data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)",
config: 'simple',
@ -786,16 +794,17 @@ class Search
query = ActiveRecord::Base.connection.quote(
all_terms
.map { |t| "'#{PG::Connection.escape_string(t)}':*" }
.map { |t| "'#{PG::Connection.escape_string(t)}':*#{weight_filter}" }
.join(" #{joiner} ")
)
"TO_TSQUERY(#{ts_config || default_ts_config}, #{query})"
end
def ts_query(ts_config = nil)
def ts_query(ts_config = nil, weight_filter: nil)
@ts_query_cache ||= {}
@ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config)
@ts_query_cache["#{ts_config || default_ts_config} #{@term} #{weight_filter}"] ||=
Search.ts_query(term: @term, ts_config: ts_config, weight_filter: weight_filter)
end
def wrap_rows(query)

View File

@ -5,52 +5,37 @@ end
def reindex_search(db = RailsMultisite::ConnectionManagement.current_db)
puts "Reindexing '#{db}'"
puts ""
puts "Posts:"
Post.exec_sql("select p.id, p.cooked, c.name category, t.title, p.post_number, t.id topic_id from
posts p
join topics t on t.id = p.topic_id
left join categories c on c.id = t.category_id
").each do |p|
post_id = p["id"]
cooked = p["cooked"]
title = p["title"]
category = p["cat"]
post_number = p["post_number"].to_i
topic_id = p["topic_id"].to_i
SearchIndexer.update_posts_index(post_id, cooked, title, category)
SearchIndexer.update_topics_index(topic_id, title , cooked) if post_number == 1
puts "Posts"
Post.includes(topic: [:category, :tags]).find_each do |p|
if p.post_number == 1
SearchIndexer.index(p.topic, force: true)
else
SearchIndexer.index(p, force: true)
end
putc "."
end
puts
puts "Users:"
User.exec_sql("select id, name, username from users").each do |u|
id = u["id"]
name = u["name"]
username = u["username"]
SearchIndexer.update_users_index(id, username, name)
puts "Users"
User.find_each do |u|
SearchIndexer.index(u, force: true)
putc "."
end
puts
puts "Categories"
Category.exec_sql("select id, name from categories").each do |c|
id = c["id"]
name = c["name"]
SearchIndexer.update_categories_index(id, name)
putc '.'
Category.find_each do |c|
SearchIndexer.index(c, force: true)
putc "."
end
puts '', 'Tags'
puts
puts "Tags"
Tag.exec_sql('select id, name from tags').each do |t|
SearchIndexer.update_tags_index(t['id'], t['name'])
putc '.'
Tag.find_each do |t|
SearchIndexer.index(t, force: true)
putc "."
end
puts

View File

@ -395,6 +395,27 @@ describe Search do
let(:tag_group) { Fabricate(:tag_group) }
let(:category) { Fabricate(:category) }
context 'post searching' do
it 'can find posts with tags' do
SiteSetting.tagging_enabled = true
post = Fabricate(:post, raw: 'I am special post')
DiscourseTagging.tag_topic_by_names(post.topic, Guardian.new(Fabricate.build(:admin)), [tag.name])
post.topic.save
# we got to make this index (it is deferred)
Jobs::ReindexSearch.new.rebuild_problem_posts
result = Search.execute(tag.name)
expect(result.posts.length).to eq(1)
SiteSetting.tagging_enabled = false
result = Search.execute(tag.name)
expect(result.posts.length).to eq(0)
end
end
context 'tagging is disabled' do
before { SiteSetting.tagging_enabled = false }
@ -856,7 +877,7 @@ describe Search do
str = " grigio:babel deprecated? "
str << "page page on Atmosphere](https://atmospherejs.com/grigio/babel)xxx: aaa.js:222 aaa'\"bbb"
ts_query = Search.ts_query(str, "simple")
ts_query = Search.ts_query(term: str, ts_config: "simple")
Post.exec_sql("SELECT to_tsvector('bbb') @@ " << ts_query)
end
@ -923,6 +944,19 @@ describe Search do
end
end
context 'in:title' do
it 'allows for search in title' do
topic = Fabricate(:topic, title: 'I am testing a title search')
_post = Fabricate(:post, topic_id: topic.id, raw: 'this is the first post')
results = Search.execute('title in:title')
expect(results.posts.length).to eq(1)
results = Search.execute('first in:title')
expect(results.posts.length).to eq(0)
end
end
context 'pagination' do
let(:number_of_results) { 2 }
let!(:post1) { Fabricate(:post, raw: 'hello hello hello hello hello') }

View File

@ -7,7 +7,7 @@ describe SearchIndexer do
data = "你好世界"
expect(data.split(" ").length).to eq(1)
SearchIndexer.update_posts_index(post_id, "你好世界", "", nil)
SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
expect(raw_data.split(' ').length).to eq(2)
@ -15,18 +15,18 @@ describe SearchIndexer do
it 'correctly indexes a post according to version' do
# Preparing so that they can be indexed to right version
SearchIndexer.update_posts_index(post_id, "dummy", "", nil)
SearchIndexer.update_posts_index(post_id, "dummy", "", nil, nil)
PostSearchData.find_by(post_id: post_id).update_attributes!(version: -1)
data = "<a>This</a> is a test"
SearchIndexer.update_posts_index(post_id, data, "", nil)
SearchIndexer.update_posts_index(post_id, "", "", nil, data)
raw_data, locale, version = PostSearchData.where(post_id: post_id).pluck(:raw_data, :locale, :version)[0]
expect(raw_data).to eq("This is a test")
expect(locale).to eq("en")
expect(version).to eq(Search::INDEX_VERSION)
SearchIndexer.update_posts_index(post_id, "tester", "", nil)
SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
expect(raw_data).to eq("tester")