BUGFIX: Chinese search was broken
BUGFIX: User locale was used index data BUGFIX: missing Norwegian fulltext config FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search) FEATURE: re-index posts when locale changes (in bg job) FEATURE: allow reindexing by trucating post_search_data Note: I removed japanese specific config cause it requires custom pg config, happy to add it once our base docker config ships with it
This commit is contained in:
parent
51ff644409
commit
3c84876660
2
Gemfile
2
Gemfile
|
@ -223,6 +223,8 @@ gem 'gctools', require: false, platform: :mri_21
|
|||
gem 'stackprof', require: false, platform: :mri_21
|
||||
gem 'memory_profiler', require: false, platform: :mri_21
|
||||
|
||||
gem 'rmmseg-cpp', require: false
|
||||
|
||||
# This silly path comment just makes it easier for me to do dev
|
||||
# will be removed in a few weeks
|
||||
gem 'logster'#, path: '../logster'
|
||||
|
|
|
@ -283,6 +283,7 @@ GEM
|
|||
rest-client (1.6.7)
|
||||
mime-types (>= 1.16)
|
||||
rinku (1.7.3)
|
||||
rmmseg-cpp (0.2.9)
|
||||
rspec (2.14.1)
|
||||
rspec-core (~> 2.14.0)
|
||||
rspec-expectations (~> 2.14.0)
|
||||
|
@ -462,6 +463,7 @@ DEPENDENCIES
|
|||
redis
|
||||
rest-client
|
||||
rinku
|
||||
rmmseg-cpp
|
||||
rspec-given
|
||||
rspec-rails
|
||||
ruby-readability
|
||||
|
|
|
@ -31,6 +31,8 @@ module Jobs
|
|||
unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first
|
||||
Post.rebake_old(250)
|
||||
end
|
||||
|
||||
Search.rebuild_problem_posts
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -8,6 +8,8 @@ class SearchObserver < ActiveRecord::Observer
|
|||
end
|
||||
|
||||
def self.update_index(table, id, search_data)
|
||||
search_data = Search.prepare_data(search_data)
|
||||
|
||||
table_name = "#{table}_search_data"
|
||||
foreign_key = "#{table}_id"
|
||||
|
||||
|
@ -16,9 +18,18 @@ class SearchObserver < ActiveRecord::Observer
|
|||
|
||||
# Would be nice to use AR here but not sure how to execut Postgres functions
|
||||
# when inserting data like this.
|
||||
rows = Post.exec_sql_row_count("UPDATE #{table_name} SET search_data = TO_TSVECTOR('#{stemmer}', ?) WHERE #{foreign_key} = ?", search_data, id)
|
||||
rows = Post.exec_sql_row_count("UPDATE #{table_name}
|
||||
SET
|
||||
raw_data = :search_data,
|
||||
locale = :locale,
|
||||
search_data = TO_TSVECTOR('#{stemmer}', :search_data)
|
||||
WHERE #{foreign_key} = :id",
|
||||
search_data: search_data, id: id, locale: SiteSetting.default_locale)
|
||||
if rows == 0
|
||||
Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data) VALUES (?, TO_TSVECTOR('#{stemmer}', ?))", id, search_data)
|
||||
Post.exec_sql("INSERT INTO #{table_name}
|
||||
(#{foreign_key}, search_data, locale, raw_data)
|
||||
VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :search_data)",
|
||||
search_data: search_data, id: id, locale: SiteSetting.default_locale)
|
||||
end
|
||||
rescue
|
||||
# don't allow concurrency to mess up saving a post
|
||||
|
@ -39,7 +50,7 @@ class SearchObserver < ActiveRecord::Observer
|
|||
update_index('category', category_id, name)
|
||||
end
|
||||
|
||||
def after_save(obj)
|
||||
def self.index(obj)
|
||||
if obj.class == Post && obj.cooked_changed?
|
||||
if obj.topic
|
||||
category_name = obj.topic.category.name if obj.topic.category
|
||||
|
@ -67,6 +78,10 @@ class SearchObserver < ActiveRecord::Observer
|
|||
end
|
||||
end
|
||||
|
||||
def after_save(object)
|
||||
SearchObserver.index(object)
|
||||
end
|
||||
|
||||
|
||||
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
||||
attr_reader :scrubbed
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
class AddRawDataToSearch < ActiveRecord::Migration
|
||||
def change
|
||||
add_column :post_search_data, :raw_data, :text
|
||||
add_column :user_search_data, :raw_data, :text
|
||||
add_column :category_search_data, :raw_data, :text
|
||||
|
||||
add_column :post_search_data, :locale, :string
|
||||
add_column :user_search_data, :locale, :text
|
||||
add_column :category_search_data, :locale, :text
|
||||
end
|
||||
end
|
|
@ -19,25 +19,67 @@ class Search
|
|||
end
|
||||
|
||||
def self.long_locale
|
||||
case I18n.locale # Currently-present in /conf/locales/* only, sorry :-( Add as needed
|
||||
# if adding a language see:
|
||||
# /usr/share/postgresql/9.3/tsearch_data for possible options
|
||||
# Do not add languages that are missing without amending the
|
||||
# base docker config
|
||||
#
|
||||
case SiteSetting.default_locale.to_sym
|
||||
when :da then 'danish'
|
||||
when :de then 'german'
|
||||
when :en then 'english'
|
||||
when :es then 'spanish'
|
||||
when :fr then 'french'
|
||||
when :it then 'italian'
|
||||
when :ja then 'japanese'
|
||||
when :nl then 'dutch'
|
||||
when :nb_NO then 'norwegian'
|
||||
when :pt then 'portuguese'
|
||||
when :pt_BR then 'portuguese'
|
||||
when :sv then 'swedish'
|
||||
when :ru then 'russian'
|
||||
else 'simple' # use the 'simple' stemmer for other languages
|
||||
end
|
||||
end
|
||||
|
||||
def self.rebuild_problem_posts(limit = 10000)
|
||||
posts = Post.joins(:topic)
|
||||
.where('posts.id NOT IN (
|
||||
SELECT post_id from post_search_data
|
||||
WHERE locale = ?
|
||||
)', SiteSetting.default_locale).limit(10000)
|
||||
|
||||
posts.each do |post|
|
||||
post.cooked += " "
|
||||
SearchObserver.index(post)
|
||||
end
|
||||
|
||||
nil
|
||||
end
|
||||
|
||||
def self.prepare_data(search_data)
|
||||
data = search_data.squish
|
||||
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
|
||||
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale)
|
||||
unless defined? RMMSeg
|
||||
require 'rmmseg'
|
||||
RMMSeg::Dictionary.load_dictionaries
|
||||
end
|
||||
|
||||
algo = RMMSeg::Algorithm.new(search_data)
|
||||
|
||||
data = ""
|
||||
while token = algo.next_token
|
||||
data << token.text << " "
|
||||
end
|
||||
end
|
||||
|
||||
data.force_encoding("UTF-8")
|
||||
data
|
||||
end
|
||||
|
||||
def initialize(term, opts=nil)
|
||||
if term.present?
|
||||
@term = term.to_s
|
||||
@term = Search.prepare_data(term.to_s)
|
||||
@original_term = PG::Connection.escape_string(@term)
|
||||
end
|
||||
|
||||
|
|
|
@ -77,6 +77,10 @@ class SqlBuilder
|
|||
16 => :value_to_boolean
|
||||
}
|
||||
|
||||
def self.map_exec(klass, sql, args = {})
|
||||
self.new(sql).map_exec(klass, args)
|
||||
end
|
||||
|
||||
def map_exec(klass = OpenStruct, args = {})
|
||||
results = exec(args)
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe SearchObserver do
|
||||
|
||||
def get_row(post_id)
|
||||
SqlBuilder.map_exec(
|
||||
OpenStruct,
|
||||
"select * from post_search_data where post_id = :post_id",
|
||||
post_id: post_id
|
||||
).first
|
||||
end
|
||||
|
||||
it 'correctly indexes chinese' do
|
||||
SiteSetting.default_locale = 'zh_CN'
|
||||
data = "你好世界"
|
||||
data.split(" ").length.should == 1
|
||||
|
||||
SearchObserver.update_posts_index(99, "你好世界", "", nil)
|
||||
|
||||
row = get_row(99)
|
||||
row.raw_data.split(' ').length.should == 2
|
||||
end
|
||||
|
||||
it 'correctly indexes a post' do
|
||||
data = "<a>This</a> is a test"
|
||||
|
||||
SearchObserver.update_posts_index(99, data, "", nil)
|
||||
|
||||
row = get_row(99)
|
||||
|
||||
row.raw_data.should == "This is a test"
|
||||
row.locale.should == "en"
|
||||
|
||||
SearchObserver.update_posts_index(99, "tester", "", nil)
|
||||
|
||||
row = get_row(99)
|
||||
|
||||
row.raw_data.should == "tester"
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue