2019-04-29 20:27:42 -04:00
# frozen_string_literal: true
2022-07-27 22:27:38 -04:00
RSpec . describe SearchIndexer do
2017-08-16 07:38:34 -04:00
# Fixed post id used by the examples that call update_posts_index directly
# instead of fabricating a Post record.
let ( :post_id ) { 99 }
2018-09-13 12:53:53 -04:00
2019-04-01 21:52:59 -04:00
# Enable the indexer before every example and disable it again afterwards.
before { SearchIndexer . enable }
after { SearchIndexer . disable }
2014-06-24 03:10:56 -04:00
# With the default locale switched to zh_CN, indexing a post whose cooked body
# is an unbroken run of Chinese characters should store raw_data segmented into
# space-separated words, and search_data with one positioned lexeme per word.
it " correctly indexes chinese " do
SiteSetting . default_locale = " zh_CN "
data = " 你好世界 "
2020-08-18 02:51:17 -04:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " " ,
category_name : " " ,
topic_tags : " " ,
cooked : data ,
private_message : false ,
)
2014-06-24 03:10:56 -04:00
2020-07-17 04:27:30 -04:00
# Read back the row written for post_id and check both stored columns.
post_search_data = PostSearchData . find_by ( post_id : post_id )
expect ( post_search_data . raw_data ) . to eq ( " 你好 世界 " )
expect ( post_search_data . search_data ) . to eq ( " '世界':2 '你好':1 " )
2014-06-24 03:10:56 -04:00
end
2018-04-26 01:46:52 -04:00
# Scrubbing a lazy-video-container div should return the text held in its
# data-video-title attribute.
it " extract youtube title " do
html =
2023-03-29 11:54:25 -04:00
" <div class= \" lazy-video-container \" data-video-id= \" lmFgeFh2nlw \" data-video-title= \" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] \" data-provider-name= \" youtube \" ></div> "
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-17 04:31:15 -04:00
expect ( scrubbed ) . to eq (
" Metallica Mixer Explains Missing Bass on 'And Justice for All' [Exclusive] " ,
)
2018-04-26 01:46:52 -04:00
end
2018-08-19 20:39:19 -04:00
# For an ordinary anchor, the scrubbed text is expected to contain the href
# followed by the link text.
it " extract a link " do
html = " <a href='http://meta.discourse.org/'>link</a> "
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-17 04:31:15 -04:00
expect ( scrubbed ) . to eq ( " http://meta.discourse.org/ link " )
2018-08-19 20:39:19 -04:00
end
2022-04-06 16:06:45 -04:00
# An anchor carrying class 'anchor' with a fragment href should contribute only
# its text; unlike a regular link, the href is expected to be left out.
it " ignores autogenerated link anchors " do
html = " <a class='anchor' href=' # something-special'>something special</a> "
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
expect ( scrubbed ) . to eq ( " something special " )
end
2019-04-29 11:15:55 -04:00
# Mention anchors should yield their visible @name text (including non-ASCII
# names); the percent-encoded /u/ hrefs are expected not to appear.
it " extracts @username from mentions " do
html =
'<p><a class="mention" href="/u/%E7%8B%AE%E5%AD%90">@狮子</a> <a class="mention" href="/u/foo">@foo</a></p>'
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2019-04-29 11:15:55 -04:00
expect ( scrubbed ) . to eq ( " @狮子 @foo " )
end
# Same expectation as for user mentions, but for a mention-group anchor: only
# the visible @group text should remain, not the percent-encoded /groups/ href.
it " extracts @groupname from group mentions " do
html =
'<p><a class="mention-group" href="/groups/%D0%B0%D0%B2%D1%82%D0%BE%D0%BC%D0%BE%D0%B1%D0%B8%D0%BB%D0%B8%D1%81%D1%82">@автомобилист</a></p>'
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2019-04-29 11:15:55 -04:00
expect ( scrubbed ) . to eq ( " @автомобилист " )
end
2019-04-29 11:26:29 -04:00
# An emoji <img> should be reduced to the :wink: shortcode that appears in its
# title and alt attributes; the image URL is expected not to appear.
it " extracts emoji name from emoji image " do
2022-01-24 00:35:30 -05:00
emoji = Emoji [ " wink " ]
2022-02-09 06:18:59 -05:00
# The src is an absolute URL built from the site base URL and the emoji's url.
html =
%Q|<img src=\"#{URI.join(Discourse.base_url_no_prefix, emoji.url)}\" title=\":wink:\" class=\"emoji only-emoji\" alt=\":wink:\" loading=\"lazy\" width=\"20\" height=\"20\">|
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2022-01-24 00:35:30 -05:00
2019-04-29 11:26:29 -04:00
expect ( scrubbed ) . to eq ( " :wink: " )
end
2018-09-13 12:53:53 -04:00
# Feeds the scrubber an external image, a locally hosted upload image and a
# lightbox wrapper (anchor, thumbnail, filename/size metadata, svg icons).
# Only the three images' alt texts are expected in the output; image and upload
# URLs, the lightbox title and the metadata spans should not be indexed.
it " doesn't index local files " do
html = << ~ HTML
< p > < img src = " https://www.discourse.org/logo.png " alt = " Discourse " > < / p>
< p > < img src = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/original/3X/0/f/0f40b818356bdc1d80acfa905034e95cfd112a3a.png " alt = " 51%20PM " width = " 289 " height = " 398 " > < / p>
< div class = " lightbox-wrapper " >
< a class = " lightbox " href = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/original/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b.jpg " data - download - href = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/16790095df3baf318fb2eb1d7e5d7860dc45d48b " title = " Untitled design (21).jpg " rel = " nofollow noopener " >
< img src = " #{ Discourse . base_url_no_prefix } /uploads/episodeinteractive/optimized/3X/1/6/16790095df3baf318fb2eb1d7e5d7860dc45d48b_1_563x500.jpg " alt = " Untitled%20design%20(21) " width = " 563 " height = " 500 " >
< div class = " meta " >
2021-11-24 23:22:43 -05:00
< svg class = " fa d-icon d-icon-far-image svg-icon " aria - hidden = " true " > < use href = " # far-image " > < / use>< /s vg >
2018-09-13 12:53:53 -04:00
< span class = " filename " > Untitled design ( 21 ) . jpg < / span>
< span class = " informations " > 1280 x1136 472 KB < / span>
2021-11-24 23:22:43 -05:00
< svg class = " fa d-icon d-icon-discourse-expand svg-icon " aria - hidden = " true " > < use href = " # discourse-expand " > < / use>< /s vg >
2018-09-13 12:53:53 -04:00
< / div>
< / a>
< / div>
HTML
2022-03-07 16:03:10 -05:00
scrubbed = SearchIndexer :: HtmlScrubber . scrub ( html )
2018-09-13 12:53:53 -04:00
2019-04-01 04:18:54 -04:00
expect ( scrubbed ) . to eq ( " Discourse 51%20PM Untitled%20design%20(21) " )
2018-09-13 12:53:53 -04:00
end
2017-08-16 07:38:34 -04:00
# Re-indexing a post whose stored search data carries a stale version should
# overwrite the row with fresh raw_data, the current default locale and the
# current POST_INDEX_VERSION.
it " correctly indexes a post according to version " do
# Create the search data row first so that its version can be forced to a
# stale value (-1) below.
2020-08-18 02:51:17 -04:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " dummy " ,
category_name : " " ,
topic_tags : nil ,
cooked : nil ,
private_message : false ,
)
2019-04-29 03:32:25 -04:00
PostSearchData . find_by ( post_id : post_id ) . update! ( version : - 1 )
2014-06-24 03:10:56 -04:00
2017-08-16 07:38:34 -04:00
# Index the same post id again, this time with real cooked HTML.
data = " <a>This</a> is a test "
2020-08-18 02:51:17 -04:00
SearchIndexer . update_posts_index (
post_id : post_id ,
topic_title : " " ,
category_name : " " ,
topic_tags : nil ,
cooked : data ,
private_message : false ,
)
2014-06-24 03:10:56 -04:00
2017-08-16 07:38:34 -04:00
# pluck returns one [raw_data, locale, version] triple per row; take the first.
raw_data , locale , version =
PostSearchData . where ( post_id : post_id ) . pluck ( :raw_data , :locale , :version ) [ 0 ]
2016-08-10 15:40:58 -04:00
expect ( raw_data ) . to eq ( " This is a test " )
2019-05-15 17:43:00 -04:00
expect ( locale ) . to eq ( SiteSetting . default_locale )
2020-07-23 02:10:05 -04:00
expect ( version ) . to eq ( SearchIndexer :: POST_INDEX_VERSION )
2014-06-24 03:10:56 -04:00
end
2019-03-19 05:16:57 -04:00
describe " .index " do
2020-07-27 03:22:54 -04:00
# A topic with a fixed title and a post in that topic. Both are `let`
# definitions, so each is only fabricated when an example first references it.
let ( :topic ) { Fabricate ( :topic , title : " this is a title that I am testing " ) }
let ( :post ) { Fabricate ( :post , topic : topic ) }
2019-03-19 05:16:57 -04:00
# Creating the post should add exactly one PostSearchData row; afterwards,
# changing the post's raw and moving it to a different topic should each
# change the stored search_data.
it " should index posts correctly " do
# First reference to `post` fabricates it inside the expect block.
expect { post } . to change { PostSearchData . count } . by ( 1 )
expect { post . update! ( raw : " this is new content " ) } . to change {
2020-07-17 04:27:30 -04:00
post . reload . post_search_data . search_data
}
2019-03-19 05:16:57 -04:00
expect { post . update! ( topic_id : Fabricate ( :topic ) . id ) } . to change {
2020-07-17 04:27:30 -04:00
post . reload . post_search_data . search_data
}
2019-03-19 05:16:57 -04:00
end
2019-03-31 22:06:27 -04:00
2023-03-20 00:43:08 -04:00
# Smoke test: there is no explicit expectation, so the example passes as long
# as update_posts_index does not raise for this input.
it " should work with edge case domain names " do
# 00E5A4 stems to 00e5 and a4, which is odd, but by-design
# this may cause internal indexing to fail due to indexes not aligning
# when stuffing terms for domains
post . update! ( cooked : << ~ HTML )
Test . 00 E5A4 . 1
HTML
SearchIndexer . update_posts_index (
post_id : post . id ,
topic_title : post . topic . title ,
category_name : post . topic . category & . name ,
topic_tags : post . topic . tags . map ( & :name ) . join ( " " ) ,
cooked : post . cooked ,
private_message : post . topic . private_message? ,
)
end
2021-04-07 03:02:00 -04:00
# Smoke test: the cooked body is an unclosed tag repeated
# Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH times. There is no explicit
# expectation, so the example passes as long as indexing does not raise.
it " should work with invalid HTML " do
2021-08-04 21:46:25 -04:00
post . update! ( cooked : " <FD> " * Nokogiri :: Gumbo :: DEFAULT_MAX_TREE_DEPTH )
2021-04-07 03:02:00 -04:00
SearchIndexer . update_posts_index (
post_id : post . id ,
topic_title : post . topic . title ,
category_name : post . topic . category & . name ,
topic_tags : post . topic . tags . map ( & :name ) . join ( " " ) ,
cooked : post . cooked ,
private_message : post . topic . private_message? ,
)
end
2019-03-31 22:06:27 -04:00
# Saving a small_action post with an empty raw (validations skipped) must not
# create a PostSearchData row.
it " should not index posts with empty raw " do
expect do
# Local `post` shadows the outer let inside this block.
post = Fabricate . build ( :post , raw : " " , post_type : Post . types [ :small_action ] )
post . save! ( validate : false )
end . to_not change { PostSearchData . count }
end
2019-03-31 22:14:29 -04:00
# A post whose raw is a single bare URL should end up with that URL appearing
# exactly once in raw_data, and with search_data matching the expected lexemes.
# NOTE(review): eq_ts_vector is a matcher defined outside this file; presumably
# it compares tsvectors independent of lexeme order — confirm.
it " should not tokenize urls and duplicate title and href in <a> " do
2020-07-27 03:22:54 -04:00
post . update! ( raw : << ~ RAW )
2019-03-31 22:14:29 -04:00
https : / /me ta . discourse . org / some . png
RAW
post . rebake!
post . reload
expect ( post . post_search_data . raw_data ) . to eq ( " https://meta.discourse.org/some.png " )
2020-07-27 03:22:54 -04:00
2023-02-02 17:55:28 -05:00
expect ( post . post_search_data . search_data ) . to eq_ts_vector (
" '/some.png':12 'discourse.org':11 'meta.discourse.org':11 'meta.discourse.org/some.png':10 'org':11 'test':8A 'titl':4A 'uncategor':9B 'meta':11 'discours':11 " ,
2020-07-27 03:22:54 -04:00
)
2020-07-09 02:56:02 -04:00
end
2020-07-27 02:46:44 -04:00
# Dotted numeric strings (two and four components) should each be kept as a
# single lexeme rather than being split on the dots.
it " should not tokenize versions " do
2020-07-27 03:17:49 -04:00
post . update! ( raw : " 123.223 " )
2020-07-27 02:46:44 -04:00
expect ( post . post_search_data . search_data ) . to eq (
2020-07-27 03:17:49 -04:00
" '123.223':10 'test':8A 'titl':4A 'uncategor':9B " ,
)
post . update! ( raw : " 15.2.231.423 " )
post . reload
expect ( post . post_search_data . search_data ) . to eq (
" '15.2.231.423':10 'test':8A 'titl':4A 'uncategor':9B " ,
2020-07-27 02:46:44 -04:00
)
end
2020-07-09 05:02:02 -04:00
# Builds a post mixing explicit URLs with query strings, bare host/path text and
# an HTML anchor, then checks that raw_data has the query strings removed from
# real URLs and that search_data contains the hosts broken into their
# sub-domains and labels.
it " should tokenize host of a URL and removes query string " do
2020-07-09 02:56:02 -04:00
# Local category/topic/post shadow the outer lets so the category and title
# lexemes in the expected vector are known.
category = Fabricate ( :category , name : " awesome category " )
topic = Fabricate ( :topic , category : category , title : " this is a test topic " )
post = Fabricate ( :post , topic : topic , raw : << ~ RAW )
2023-02-02 17:55:28 -05:00
a https : / / car . com? bob = 1 , http : / /e fg . com . au? bill = 1 b hij . net / xyz = 1
www . klm . net / ? IGNORE = 1 < a href = " http://abc.de.nop.co.uk?IGNORE=1&ignore2=2 " > test < / a> https: / / cars . com
2020-07-09 02:56:02 -04:00
RAW
post . rebake!
post . reload
# NOTE(review): this reassignment is not read again in this example.
topic = post . topic
2020-10-07 20:40:13 -04:00
# Note: a random non-URL string should still be tokenized properly, which is
# why www.klm.net/?IGNORE=1 is kept as text: its link was inserted by autolinking.
# We could consider amending the autolinker to add more context saying
# "hey, this part of <a href>...</a> was a guess by the autolinker".
# A blanket treatment of non-URLs without this logic is risky.
2020-07-09 02:56:02 -04:00
expect ( post . post_search_data . raw_data ) . to eq (
2023-02-02 17:55:28 -05:00
" a https://car.com , http://efg.com.au b http://hij.net/xyz=1 hij.net/xyz=1 http://www.klm.net/ www.klm.net/?IGNORE=1 http://abc.de.nop.co.uk test https://cars.com " ,
2020-07-09 02:56:02 -04:00
)
2023-02-02 17:55:28 -05:00
expect ( post . post_search_data . search_data ) . to eq_ts_vector (
" '/?ignore=1':21 '/xyz=1':14,17 'car.com':9 'cars.com':24 'abc.de.nop.co.uk':22 'au':10 'awesom':6B 'b':11 'categori':7B 'co.uk':22 'com':9,10,24 'com.au':10 'de.nop.co.uk':22 'efg.com.au':10 'hij.net':13,16 'hij.net/xyz=1':12,15 'klm.net':18,20 'net':13,16,18,20 'nop.co.uk':22 'test':4A,23 'topic':5A 'uk':22 'www.klm.net':18,20 'www.klm.net/?ignore=1':19 'car':9,24 'co':22 'de':22 'efg':10 'hij':13,16 'klm':18,20 'nop':22 'www':18,20 'abc':22 " ,
2019-03-31 22:14:29 -04:00
)
end
# Cooks a post containing an <img> with title and alt, with jobs run inline and
# max_image_width set to 1, then checks that the cooked HTML has no lightbox
# wrapper class and that raw_data holds the prose followed by the image's alt
# and title text (no image URL).
it " should not include lightbox in search " do
Jobs . run_immediately!
2019-03-31 22:32:25 -04:00
SiteSetting . max_image_width = 1
2023-02-15 23:02:03 -05:00
# NOTE(review): the stub targets 1.2.3.4 while src below uses
# meta.discourse.org; presumably the test environment resolves hosts to
# that address — confirm.
stub_request ( :get , " https://1.2.3.4/some.png " ) . to_return (
2019-03-31 22:32:25 -04:00
status : 200 ,
body : file_from_fixtures ( " logo.png " ) . read ,
)
2019-03-31 22:14:29 -04:00
src = " https://meta.discourse.org/some.png "
post = Fabricate ( :post , raw : << ~ RAW )
Let me see how I can fix this image
2019-04-01 04:18:54 -04:00
< img src = " #{ src } " title = " GOT " alt = " white walkers " width = " 2 " height = " 2 " >
2019-03-31 22:14:29 -04:00
RAW
post . rebake!
post . reload
2019-03-31 22:32:25 -04:00
expect ( post . cooked ) . not_to include ( CookedPostProcessor :: LIGHTBOX_WRAPPER_CSS_CLASS )
2019-03-31 22:14:29 -04:00
expect ( post . post_search_data . raw_data ) . to eq (
2020-07-17 04:27:30 -04:00
" Let me see how I can fix this image white walkers GOT " ,
2019-03-31 22:14:29 -04:00
)
end
2020-08-06 00:25:03 -04:00
# In raw_data, links to audio and video files should be replaced by the
# translated "search.audio" / "search.video" strings, the ordinary link should
# lose its query string, and the malformed URL should be kept as written.
# search_data is still expected to contain lexemes for the media URLs.
it " should strips audio and videos URLs from raw data " do
SiteSetting . authorized_extensions = " mp4 "
2020-10-07 20:40:13 -04:00
Fabricate ( :video_upload )
2020-08-06 00:25:03 -04:00
post . update! ( raw : << ~ RAW )
link to an external page : https : / / google . com / ?u = bar
link to an audio file : https : / /somesi te . com / audio . m4a
link to a video file : https : / /somesi te . com / content / somethingelse . MOV
link to an invalid URL : http : error ]
RAW
expect ( post . post_search_data . raw_data ) . to eq (
" link to an external page: https://google.com/ link to an audio file: #{ I18n . t ( " search.audio " ) } link to a video file: #{ I18n . t ( " search.video " ) } link to an invalid URL: http:error] " ,
)
2023-02-02 17:55:28 -05:00
expect ( post . post_search_data . search_data ) . to eq_ts_vector (
" '/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27 'googl':15 'somesit':22,30 " ,
2020-08-06 00:25:03 -04:00
)
end
2022-03-07 16:03:10 -05:00
# The raw text contains the same word spelled with and without a ligature.
# With search_ignore_accents on, only the plain spelling's stem should be
# indexed; after turning the setting off and force-reindexing, both stems
# should be present.
it " should unaccent indexed content " do
SiteSetting . search_ignore_accents = true
post . update! ( raw : " Cette oeuvre d'art n'est pas une œuvre " )
post . post_search_data . reload
expect ( post . post_search_data . search_data ) . not_to include ( " œuvr " )
expect ( post . post_search_data . search_data ) . to include ( " oeuvr " )
SiteSetting . search_ignore_accents = false
# force: true re-indexes even though the post itself has not changed.
SearchIndexer . index ( post , force : true )
post . post_search_data . reload
expect ( post . post_search_data . search_data ) . to include ( " œuvr " )
expect ( post . post_search_data . search_data ) . to include ( " oeuvr " )
end
2022-04-06 13:23:30 -04:00
# With search_max_indexed_word_length set to 4, raw_data should still hold the
# full post text, while every lexeme stored in search_data is expected to be at
# most four characters long.
it " truncates long words in the index " do
SiteSetting . search_max_indexed_word_length = 4
title = " A title that is long enough "
contents = " I am the best beige object http://example.com/long/url "
topic . update! ( title : title )
post . update! ( raw : contents )
post_search_data = post . post_search_data
post_search_data . reload
expect ( post_search_data . raw_data ) . to eq ( contents )
# Pull each single-quoted lexeme out of the tsvector text, dropping positions.
words = post_search_data . search_data . scan ( / '([^']*)' / ) . map { | match | match [ 0 ] }
expect ( words ) . to contain_exactly (
" best " ,
" beig " ,
" obj " ,
" http " ,
" titl " ,
" long " ,
" enou " ,
" unca " ,
)
end
2023-01-30 20:41:31 -05:00
# With max_duplicate_search_index_terms set to 5, a word repeated ten times in
# the post should be recorded at only five positions in search_data. The two
# near-identical links check how URLs containing colons and quotes are stored.
it " limits number of repeated terms when max_duplicate_search_index_terms site setting has been configured " do
SiteSetting . max_duplicate_search_index_terms = 5
2023-02-01 20:17:19 -05:00
contents = << ~ TEXT
#{"sam " * 10}
< a href = " https://something.com/path:path'path?term='hello' " > url < / a>
2023-02-02 17:55:28 -05:00
< a href = " https://somethings.com/path:path'path?term='hello' " > url < / a>
2023-02-01 20:17:19 -05:00
TEXT
2023-01-30 20:41:31 -05:00
post . update! ( raw : contents )
post_search_data = post . post_search_data
post_search_data . reload
2023-02-01 20:17:19 -05:00
terms =
2023-02-02 17:55:28 -05:00
" '/path:path''path':22,26 'com':21,25 'sam':10,11,12,13,14 'something.com':21 'something.com/path:path''path':20 'test':8A 'titl':4A 'uncategor':9B 'url':23,27 'someth':21,25 'somethings.com':25 'somethings.com/path:path''path':24 "
2023-02-01 20:17:19 -05:00
2023-02-02 17:55:28 -05:00
expect ( post_search_data . search_data ) . to eq_ts_vector ( terms )
2023-01-30 20:41:31 -05:00
end
2019-03-19 05:16:57 -04:00
end
2019-04-01 21:52:59 -04:00
# queue_post_reindex(topic_id) should mark the search data of posts in that
# topic for reindexing without touching posts elsewhere.
describe " .queue_post_reindex " do
let ( :post ) { Fabricate ( :post ) }
let ( :topic ) { post . topic }
it " should reset the version of search data for all posts in the topic " do
# post2 is fabricated without being tied to `topic`; presumably it lands in a
# different topic and acts as the control — confirm against the fabricator.
post2 = Fabricate ( :post )
SearchIndexer . queue_post_reindex ( topic . id )
expect ( post . reload . post_search_data . version ) . to eq ( SearchIndexer :: REINDEX_VERSION )
expect ( post2 . reload . post_search_data . version ) . to eq ( SearchIndexer :: POST_INDEX_VERSION )
end
end
2021-04-27 01:52:45 -04:00
# queue_users_reindex(ids) should mark only the listed users' search data for
# reindexing.
describe " .queue_users_reindex " do
let! ( :user ) { Fabricate ( :user ) }
let! ( :user2 ) { Fabricate ( :user ) }
it " should reset the version of search data for all users " do
# Index both users explicitly so each has a user_search_data row to inspect.
SearchIndexer . index ( user , force : true )
SearchIndexer . index ( user2 , force : true )
# Only `user` is queued; user2 is the control and should keep the current version.
SearchIndexer . queue_users_reindex ( [ user . id ] )
expect ( user . reload . user_search_data . version ) . to eq ( SearchIndexer :: REINDEX_VERSION )
expect ( user2 . reload . user_search_data . version ) . to eq ( SearchIndexer :: USER_INDEX_VERSION )
end
end
2014-06-24 03:10:56 -04:00
end