From ce79a1879028e1fc470b4d6513b3d40358f2a465 Mon Sep 17 00:00:00 2001 From: Sam Date: Tue, 18 Feb 2025 09:22:57 +1100 Subject: [PATCH] FEATURE: Native PDF support (#1127) * FEATURE: Native PDF support This amends it so we use PDF Reader gem to extract text from PDFs * This means that our simple pdf eval passes at last * fix spec * skip test in CI * test file support * Update lib/utils/image_to_text.rb Co-authored-by: Alan Guo Xiang Tan * address pr comments --------- Co-authored-by: Alan Guo Xiang Tan --- .../admin/ai_personas_controller.rb | 2 +- .../rag_document_fragments_controller.rb | 4 +- app/jobs/regular/digest_rag_upload.rb | 16 +-- .../ai_custom_tool_list_serializer.rb | 2 +- .../components/ai-persona-editor.gjs | 4 +- .../discourse/components/ai-tool-editor.gjs | 4 +- .../discourse/components/rag-options.gjs | 2 +- .../discourse/components/rag-uploader.gjs | 10 +- config/locales/client.en.yml | 4 +- config/settings.yml | 2 +- evals/lib/eval.rb | 21 +--- lib/utils/image_to_text.rb | 24 +++- lib/utils/pdf_to_images.rb | 5 +- lib/utils/pdf_to_text.rb | 112 ++++++++++++++++++ plugin.rb | 10 ++ spec/fixtures/rag/2-page.pdf | Bin 0 -> 13529 bytes spec/jobs/regular/digest_rag_upload_spec.rb | 8 +- spec/lib/utils/pdf_to_text_spec.rb | 62 ++++++++++ .../rag_document_fragments_controller_spec.rb | 18 +-- 19 files changed, 248 insertions(+), 62 deletions(-) create mode 100644 lib/utils/pdf_to_text.rb create mode 100644 spec/fixtures/rag/2-page.pdf create mode 100644 spec/lib/utils/pdf_to_text_spec.rb diff --git a/app/controllers/discourse_ai/admin/ai_personas_controller.rb b/app/controllers/discourse_ai/admin/ai_personas_controller.rb index 540ac824..b982677b 100644 --- a/app/controllers/discourse_ai/admin/ai_personas_controller.rb +++ b/app/controllers/discourse_ai/admin/ai_personas_controller.rb @@ -41,7 +41,7 @@ module DiscourseAi tools: tools, llms: llms, settings: { - rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled, + rag_images_enabled: 
SiteSetting.ai_rag_images_enabled, }, }, } diff --git a/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb b/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb index d27f4a82..d6ee53b4 100644 --- a/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb +++ b/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb @@ -48,8 +48,8 @@ module DiscourseAi def validate_extension!(filename) extension = File.extname(filename)[1..-1] || "" - authorized_extensions = %w[txt md] - authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled + authorized_extensions = %w[txt md pdf] + authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled if !authorized_extensions.include?(extension) raise Discourse::InvalidParameters.new( I18n.t( diff --git a/app/jobs/regular/digest_rag_upload.rb b/app/jobs/regular/digest_rag_upload.rb index 179660d1..4ac45971 100644 --- a/app/jobs/regular/digest_rag_upload.rb +++ b/app/jobs/regular/digest_rag_upload.rb @@ -164,22 +164,16 @@ module ::Jobs end def get_uploaded_file(upload:, target:) - if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled + if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled raise Discourse::InvalidAccess.new( - "The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.", + "The setting ai_rag_images_enabled is false, can not index images", ) end if upload.extension == "pdf" - pages = - DiscourseAi::Utils::PdfToImages.new( - upload: upload, - user: Discourse.system_user, - ).uploaded_pages - return( - DiscourseAi::Utils::ImageToText.as_fake_file( - uploads: pages, - llm_model: target.rag_llm_model, + DiscourseAi::Utils::PdfToText.as_fake_file( + upload: upload, + llm_model: SiteSetting.ai_rag_images_enabled ? 
target.rag_llm_model : nil, user: Discourse.system_user, ) ) diff --git a/app/serializers/ai_custom_tool_list_serializer.rb b/app/serializers/ai_custom_tool_list_serializer.rb index 9f4ad25d..36642a68 100644 --- a/app/serializers/ai_custom_tool_list_serializer.rb +++ b/app/serializers/ai_custom_tool_list_serializer.rb @@ -10,7 +10,7 @@ class AiCustomToolListSerializer < ApplicationSerializer presets: AiTool.presets, llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization, settings: { - rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled, + rag_images_enabled: SiteSetting.ai_rag_images_enabled, }, } end diff --git a/assets/javascripts/discourse/components/ai-persona-editor.gjs b/assets/javascripts/discourse/components/ai-persona-editor.gjs index 2f4c1bd8..eb69f77d 100644 --- a/assets/javascripts/discourse/components/ai-persona-editor.gjs +++ b/assets/javascripts/discourse/components/ai-persona-editor.gjs @@ -596,13 +596,13 @@ export default class PersonaEditor extends Component { @target={{this.editingModel}} @updateUploads={{this.updateUploads}} @onRemove={{this.removeUpload}} - @allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}} + @allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}} />
{{/if}} diff --git a/assets/javascripts/discourse/components/rag-options.gjs b/assets/javascripts/discourse/components/rag-options.gjs index 2f38346b..08130642 100644 --- a/assets/javascripts/discourse/components/rag-options.gjs +++ b/assets/javascripts/discourse/components/rag-options.gjs @@ -81,7 +81,7 @@ export default class RagOptions extends Component { }} /> - {{#if @allowPdfsAndImages}} + {{#if @allowImages}}

{{i18n "discourse_ai.rag.uploads.title"}}

- {{#if @allowPdfsAndImages}} -

{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}

+ {{#if @allowImages}} +

{{i18n "discourse_ai.rag.uploads.description_with_images"}}

{{else}}

{{i18n "discourse_ai.rag.uploads.description"}}

{{/if}} diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml index e738b4bb..96a2d00f 100644 --- a/config/locales/client.en.yml +++ b/config/locales/client.en.yml @@ -280,8 +280,8 @@ en: hide_indexing_options: "Hide upload options" uploads: title: "Uploads" - description: "Plaintext (.txt) or markdown (.md)" - description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)" + description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)" + description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)" button: "Add files" filter: "Filter uploads" indexed: "Indexed" diff --git a/config/settings.yml b/config/settings.yml index 850d63ec..d2e212fb 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -355,6 +355,6 @@ discourse_ai: hidden: true type: list - ai_rag_pdf_images_enabled: + ai_rag_images_enabled: default: false hidden: true diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 3e632854..05b90a15 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -130,22 +130,13 @@ class DiscourseAi::Evals::Eval upload = UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id) - uploads = - DiscourseAi::Utils::PdfToImages.new( - upload: upload, - user: Discourse.system_user, - ).uploaded_pages - text = +"" - uploads.each do |page_upload| - DiscourseAi::Utils::ImageToText - .new(upload: page_upload, llm_model: llm.llm_model, user: Discourse.system_user) - .extract_text do |chunk, error| - text << chunk if chunk - text << "\n\n" if chunk - end - upload.destroy - end + DiscourseAi::Utils::PdfToText + .new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model) + .extract_text do |chunk| + text << chunk if chunk + text << "\n\n" if chunk + end text ensure diff --git a/lib/utils/image_to_text.rb b/lib/utils/image_to_text.rb index 43f3c671..d2433feb 100644 --- a/lib/utils/image_to_text.rb +++ b/lib/utils/image_to_text.rb @@ -50,12 +50,27 
@@ class DiscourseAi::Utils::ImageToText Reader.new(uploads: uploads, llm_model: llm_model, user: user) end + def self.tesseract_installed? + if defined?(@tesseract_installed) + @tesseract_installed + else + @tesseract_installed = + begin + Discourse::Utils.execute_command("which", "tesseract") + true + rescue Discourse::Utils::CommandError + false + end + end + end + attr_reader :upload, :llm_model, :user - def initialize(upload:, llm_model:, user:) + def initialize(upload:, llm_model:, user:, guidance_text: nil) @upload = upload @llm_model = llm_model @user = user + @guidance_text = guidance_text end def extract_text(retries: 3) @@ -104,7 +119,8 @@ class DiscourseAi::Utils::ImageToText end def extract_text_from_page(page) - raw_text = extract_text_with_tesseract(page) + raw_text = @guidance_text + raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed? llm = llm_model.to_llm if raw_text.present? @@ -112,7 +128,7 @@ class DiscourseAi::Utils::ImageToText { type: :user, content: - "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}", + "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}", upload_ids: [page.id], }, ] @@ -127,6 +143,8 @@ class DiscourseAi::Utils::ImageToText end def extract_text_with_tesseract(page) + # return nil if we can not find tesseract binary + return nil if !self.class.tesseract_installed? upload_path = if page.local? Discourse.store.path_for(page) diff --git a/lib/utils/pdf_to_images.rb b/lib/utils/pdf_to_images.rb index b9865344..0411ab7a 100644 --- a/lib/utils/pdf_to_images.rb +++ b/lib/utils/pdf_to_images.rb @@ -19,8 +19,6 @@ class DiscourseAi::Utils::PdfToImages end def extract_pages - Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}") - begin pdf_path = if upload.local?
@@ -31,6 +29,7 @@ class DiscourseAi::Utils::PdfToImages raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil? + temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}") temp_pdf = File.join(temp_dir, "source.pdf") FileUtils.cp(pdf_path, temp_pdf) @@ -74,7 +73,7 @@ class DiscourseAi::Utils::PdfToImages @uploaded_pages = uploads ensure - FileUtils.rm_rf(temp_dir) + FileUtils.rm_rf(temp_dir) if temp_dir end end end diff --git a/lib/utils/pdf_to_text.rb b/lib/utils/pdf_to_text.rb new file mode 100644 index 00000000..9acd2b89 --- /dev/null +++ b/lib/utils/pdf_to_text.rb @@ -0,0 +1,112 @@ +# frozen_string_literal: true + +class DiscourseAi::Utils::PdfToText + MAX_PDF_SIZE = 100.megabytes + + class Reader + def initialize(upload:, user: nil, llm_model: nil) + @extractor = + DiscourseAi::Utils::PdfToText.new(upload: upload, user: user, llm_model: llm_model) + @enumerator = create_enumerator + @buffer = +"" + end + + def read(length) + return @buffer.slice!(0, length) if !@buffer.empty? + + begin + @buffer << @enumerator.next + rescue StopIteration + return nil + end + + @buffer.slice!(0, length) + end + + private + + def create_enumerator + Enumerator.new { |yielder| @extractor.extract_text { |chunk| yielder.yield(chunk || "") } } + end + end + + attr_reader :upload + + def self.as_fake_file(upload:, user: nil, llm_model: nil) + Reader.new(upload: upload, user: user, llm_model: llm_model) + end + + def initialize(upload:, user: nil, llm_model: nil) + @upload = upload + @user = user + @llm_model = llm_model + end + + def extract_text + pdf_path = + if upload.local? + Discourse.store.path_for(upload) + else + Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path + end + + raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil? 
+ + require "pdf/reader" + + page_number = 0 + PDF::Reader.open(pdf_path) do |reader| + reader.pages.each do |page| + page_number += 1 + llm_decorate(page_number: page_number, text: page.text, pdf_path: pdf_path) do |chunk| + yield chunk + end + end + end + end + + def llm_decorate(page_number:, text:, pdf_path:) + raise "Must be called with block" if !block_given? + if !@llm_model + yield text + return + end + + begin + temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}") + output_path = File.join(temp_dir, "page-#{page_number}.png") + + # Extract specific page using ImageMagick + # image magick uses 0 based page numbers + command = [ + "magick", + "-density", + "300", + "#{pdf_path}[#{page_number - 1}]", + "-background", + "white", + "-auto-orient", + "-quality", + "85", + output_path, + ] + + Discourse::Utils.execute_command( + *command, + failure_message: "Failed to convert PDF page #{page_number} to image", + timeout: 30, + ) + + # TODO - we are creating leftover uploads, they will be cleaned up + # but maybe we should just keep them around? 
+ upload = + UploadCreator.new(File.open(output_path), "page-#{page_number}.png").create_for(@user&.id) + + DiscourseAi::Utils::ImageToText + .new(upload: upload, llm_model: @llm_model, user: @user, guidance_text: text) + .extract_text { |chunk| yield chunk } + ensure + FileUtils.rm_rf(temp_dir) if temp_dir + end + end +end diff --git a/plugin.rb b/plugin.rb index 36c9bca3..4b4726eb 100644 --- a/plugin.rb +++ b/plugin.rb @@ -12,6 +12,16 @@ gem "tokenizers", "0.4.4" gem "tiktoken_ruby", "0.0.9" gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this +# we probably want to move all dependencies directly in to the Discourse Gemfile, this +# will give us a strong guarantee that the dependencies are compatible and keep getting upgraded +gem "Ascii85", "2.0.1", require: false +gem "ruby-rc4", "0.1.5", require: false +gem "hashery", "2.1.2", require: false +gem "ttfunk", "1.8.0", require: false +gem "afm", "0.2.2", require: false +# all above are required by pdf-reader +gem "pdf-reader", "2.14.1", require: false + enabled_site_setting :discourse_ai_enabled register_asset "stylesheets/common/streaming.scss" diff --git a/spec/fixtures/rag/2-page.pdf b/spec/fixtures/rag/2-page.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e12bec20cc449b5d488a024bdb14cbfdfc7e12a7 GIT binary patch literal 13529 zcmeHO2{csy+m}Qpp(5F*vSyjZ%rMp%24Tp)YmC8QW;8Q0)X#6A4rgqP8}O&Y%(T zWDqk`DZUj{2NlqtnNjdmO6wJ!KgeIgr<)V!e%oO0?jk|eHJI8Lsuy_f`L+4mbNA)W z;KSMm$p#%A)9g`#8SJNGchGfRmr%WL%4+qV6e?c}Lz*Vs%I6TRi4 z(Vr#MB=59;y{r#=z>7Fu$>%vz_x~nTRuI323WNO?D~wg?zpLxFy2HR~YHEm;MSiVD zws7i99!i`I&WlR%iL0WZd$6+!*gwk(HGdFH1?^S54*#dEc=* zYEbG))?$Uw`>^5p%#gN9yWm~2vU(P5R^Z6({K=bMcZX`REoiaH^O~L~`MTW|U+cDt ze+iF(ZPuE(xmuFlTbX@CIU!zBd~qf|!{P>*=NO%;CdKvTq5o4(i=&ez(nk+IpE3+! 
zD{SF+lT0{29%gq&FqUd~gQh8QZ2ZY-{KD#3A;Z@po0QVU*$22ANTvygmOixeMa5>X zYBrZ1C}8m86xr+%N3)F%ZM->*b;X2g9f*bR{E!&W&OzUz94kUj;9Yvb`%YdiLYb_v z1zg)HhzaFw=~uj67s69r+BT!SF!2Qr3i~v{5xPmaPM#^sdyz+#31;79uoxL@A zJ>OW=_|BnC_q4gLG(TvW>qqq5Rf^$~h!LKh06`>T>lU}P2KJN7xWgkaZH(cIl~~&7 ziYb%?lV>iOavNMfO7mw3e-!$V>%Vay_bt6wii2zS)vncvoO;FYh#S;4S9-^sZ#t^Y z!}n614WFi;#z%f@s1ws0{bHb4;(*JCHP5&QTM@2X&EtaXZ|O2c=%07wcL|-UQl6mn z8|YqNlH2AM0n3Ef-dva8kd=KbM<*l}V`1deu%r2=pxrZ`LvuQy_;r#;-frbPw7FR6 z@#f&o;R-ENu=^#R@#QvCpY!LM=Z?r6=`Q*3c5Z$+s#~j@>V8d`O-+4fd@3l0lk4+L z@aIos@85NGJnwk3FW{<1pnCVdtCU-VpHj$96$q+NNoQvVrPspcnyaJNwWLcZ;q?a5 zb3v8CE(x~VK^#5>dJD=1MQ3Fy6f31F3yV{#TI$3Dt|{MrXLrF=Fx2~EUa>Ongh%w} z0?T;5AmOgdY8JF<8Dm4+$E8xnM;RJ^UdmDz7p`K?Uh#P03aQIimy)z8rZH7xkR4+3 z`VBq?>m>x^QmO=IJk$m!2}eEDbuPR3@V)NnsAITlN6xL47tL!xbX||~-}!8$D$J2B zY%4uZBjB#?ohzMhN1;xaYO1;^c}Alz6}^XLk~8-iG?_hKyG|VU_)T}^y;RS5=s<-a zN8Xvx+l8_4QBQq4nW+-@GI^wBj3xcWGka4Iub?Lmc@B`V> zRZk-C*77)AQd}?VB@T^@gM@Nfko~4>pobnj`Qx_+j4|Go1`ljM!$Gm=UH!I%1Bk}SsUeg=TXD~jIF{@I;G^? zp|Pe&19c`$usX@Y)}$?Ow5-JW_$r9q?vA96cc9tT7gO8=C+G$)!ut(T9G)iKZ^O9L zHy4{y&kvoFdB#)2E92aq;9~o(g&B43$Q>&0K&5D&>lK^5vn6%fcSR>l%jXXegDw3%v%jX`6mUfA6Cd$1xOD8rAL*_wN566batl6b5#QdD@S@IG4U_@D^SAJO7D#T;N>#n}iY;xx zdA+Q^Ce-A>lj%4FdxXPQ6BSQ!8@eGc|FLs-X<~7YMRMO z8FNPxGX3^cCq%8W9VHp zD{jq4gbt@gZX~;&&S|jNeL7=s=Jn9#%|dbEZl!yl*(nYQgVxvA)D1ild}Y7pupNKM z8kqI5K!xT!&?F*%+U6Jv=k{`16}>ktj@;VfGSK31f4H(6R!ry@Sj>qE6>B_hs*F)f zSl#U>5JZ{vzdc#P+6CC^jph zxO%Xi6MiE?4aLvFD=~z}2xpt!QTo)6;R>4dQ`qu_?6csvIzCTYae6)avHFuC@!WU> zwK(OeaBTaH=#e8vcYS($Z6~kL-A<)tyCr8Xna!w}XO8v?irH7Zd&^NX`|!h?gq`PR zN8LjEKTk|nVT>*JjVxuI9ojv2AZ#H2acCYkMDD!(him2><2Rf_od}24Gb=TktrDUg zs~5{V(1IFIDN*s!(H2Ui`@Rac!|xeK?n}HNqnu7ZbygxZXu=ZHN5my0k?A5@+E3Es zaFa(*hISaA&y;#6r7J7UIKeC1H74~o%j4Qf@61<6PA;50c@NDE(rJyBbe7Ld=8&Cx zXdHegdjC~RwOLq(Qma9>i-6;Mrzu#3bSkehCQBbmtLV&32vBUPU?zcHJQl9WJ9u(7 zD>q`Dq%(?FQbIeFAe9=A;o>V6&|Kh-^sy?z-Nu!B_1rEm+RC*0P}8{CwL;gq9A2=~>0FpApoRs8%T#@_P=gN$)y#gR 
ziqv>t426oLbr8a23Hl}VhYNMZ6%SmXukMh&g0)4cdy5iPd%LEB@BAB`an{Y6goVtP;Xrb0n76G+30Cmx+CNnB>$l@h~Y>NI)nx_ya}$;row*No_V8 z^G;}58Mj%VDA;x4g4EKkfdVND<9#pdcN>fB;Jzf)xQiCwn)A?IkD43mfNO21H;%{L zS$Eb0DYGHbg8M-Zx0RupGzZ<|xw9GKQd47j=(U`F(ZHpFkE+VYn->QA)q-BJ1y!rj zpIT+!x^MYK;?dGl)ZC+R+EU#W6|XLcsL%vdymMc_=K8ZB4YIaF zxUQZ>z24L+wuS@LoXkFHx@tm*U;oPeQf7|Lg1|Xq?PSQAU2&au$;Lg-9a6bQ69F0y z$SQGH zmL@$XWHM*ypc$=(IhNHL=n~p0B;d-N?%BT>rJAt6zoEO#>+qiWzKV@H;pD179kwr8 zMshkKB{vzJCHVRA+b0&42FBIu#11^3I57AAFWfL zL}q)-U&m;!aS>sIXlt2@_s5NnZ)6jmHJZ)*z_YYy*<0-RXt3u-TG(#R?KKsWJ72~O zyx11MSHaltZfBNBIQOo;f;#q*J=W0%3uCbtOE`(R3niOD{oI0Wt5%0%q67RTv8mA? zOVZm5{gnkay%J#UkoNv5#e(r~M=sr5tm%Z>{qlhoz8FuTz~eDR6JN26pg zgyYugN&4wUj5}#ZQzXORxRlC3Y+Sq79h_v^Dko+h1AAO8d*Zk?bmN{r z%a10a_9^iZ3~#u8VCx%(oM0mQwZP}PX3p)U%!?@!+=lG z^_m7JX0`lu&P_;hikQZ{O)=0GNzB`g#J%3bubR+xgiGdZ5iV_L+TEW97B+NscgCGd z_P(f{iIb&N^dyR*Eg^+@+dHp(@jH`E_HSgEVO=tuRUsrd z_Nl?fmlt+emr*&7ZyciK+$}EB-uC&cp%zK&CPwp)Z5psY=0q%y zv2Z_N5_DarRX;r>Zf`-Y6LSoccXA?lYrVM*eu#7+Prv2ia#B1Lp^YJ-+ErYGLL3z}cDXFORh zOANSy@{`>vAs^)tK51w{$eeF$+&uJr>s0rlcE|iV-px~j)`2Z&asIo+uQT0BrX(!s z!9snTZdP1PyZ>^s&jTquJaImyD`CPr$|U5TXNGZn;ps9BiQ$bpJs89# z^K##UJE>{pnVZ&l{p)<0GpA1lBR%K8I40gnektr6u6@F!R3Fyp`f<7`1^2w6Y^vJ+ z@Twb8!E<59E#{jpc_#FEL}aCBjNda`XmhTN^Hat0n@=$Ai+tY@c}$>NcBUH@3N6A;pfP)%3-^ zQsGDYBep2pE3Z8*yJ!ybYTHyF)EYD&G|}4?+}hQ)t93AuT;d!w)=j7ztLwc-h?bdA z$W5HqQyB7paP<0~pl~nluJ|7KOP^X&Q&)^i!W04*I;^{Q9Zus-fZwK=FGpVKxQ91< zYFuvd$@?A_Hcc0AS2P5 z%$Sc6Pr(ax-@05|cKJ$U*~J2;QJm=0GiP+K#**E$w+NZXrRrzZys#p5#Z4UfNcC)7r(qifi?GerR!)@|t`|@~)m)FiNy;)AdMl-Ju3dfudl_>>iWj0Mz6V$8t%kqEjJFFzs}3WDg;2}BA5 ztcHexAS`378zndl1%hDk{zgQSmp22D10Gs2h-6!^8X&?_4`A9vICvQwwFCi3=XYQj zhWHIFg(Fq|2MFBd*aM#iFn{B?c68G_{G;plhc+H8BW{gZ${|VK9+46h5hU;i&5wHY z=<&5s)5b*?54lt|wRJVPp6k?-Qg`&)rWhVCGQxH;Aa`TCeHKNqFW5GsWb9pqbZu|t zqY5QFwX!1j`lI@&iPgOuK5bmD|5he=_WE??`%eCvJ)Eg>vX+*s38$xW>#n3nHBUR11mCS!)Fs8XnC*0IlO|ubGroOl|4q+8o^wmqFonXRS2u>> zS9WUKMVo7=Mvn&;uj_1P|LFB<(`ZK4nwp>tS69f}Ib(y5A_Uw)jRraCg8?6^hhAd# 
z_Rn$lr-v+8{};R+hWwGjzxn`T72wWbHH9@TfVfQ6!_$eZO8ocjK%&tZ7;ij{Rc2HC zx3ww=Vn^~|c+;JrU?dU+MgSdxghH3Ea3nwj)~wUzYn^}eA9%SxEkO`#sttw2nqq*L zi~V!?Q9pdg!obU+{LJuUa`4pv7{(F-)LPJ}1S=xL2?7jOAl5`C!{w*G`k{9@dJDW4 z5n_R-vDy^|Wodv|66w?c8i7a$!&pWvA!bAm5}x%Mu%4rUmrfN8^uIOUoer^L^)oGi z!1$_y&PovmNG70;4qit0SM>CXnEn;pz{~ZQ?Pbqb*oJ){NqpZF%!mxU2cCidt0C;o zOuvtt5NK88kEw&E4$~h`@Fg<9?nEyVMQhu5^{s7Sl84qdJER%Z%pXJaCK(0Oh*rTk zYeKLu0qwC(M;oNcRA-X?$$%9wlk7*Kt24E>;enwKQ5{&b$dGN?Ah0IQ!&BW-AN!35 zxYOF^&0zSeLm+{Hfy#j@%2b*c1cpYVAy7C34p#y=l;}Ye2A-)zp>JoY_^RW3e*wD~ zn57f%P7PpaZQJ%uvHuFi6uR=VBgzCS83H_jz?4;%odHw-niD?^o4z;gC#C^r;B9t;7me2$@#$pI7+BS@2_@`nz;Z znD%|+`axsG0bx15B9Q+yI3Uk|W$mZnn0S96IOzA_06ujLFf9cB30PC9e%f|45-?BH zg#1ch_W3U&Q6CsLYQv#$q!JXa1VdTFVCqn`I#fl0wScY=|H}E7v>DZdi#9cKNb0x68n$U{{N}S z4`>a*y|uOlvVgO{!TEm$a=${327W{`06zi%B%b1dr+EO-CzyqrvoKIEFzegKz)BYk z0Dk5GY#RjNVGs*HJb_3Cz)6UnAC=(C0!ASw0Ic~FM2ZF^Swn^INa25C?Hl&`53H@g z`IkY_pRMUr2>~pBS+T)bj{Ovzi4!X`mSFJm6Y$SrEVKU}hc%5zw4_p58kbRD);$=$ zjQoNimQ+8Y8Qvd={CiR%7T=PzoY`MYEQ@_JfeRqhoj_2q6Zor7E+81N{_2p#}h0HGG9a_=<0qFXs>g;JSWPFCaod z@i$nT6|WZ&K-!298#>WEfZ<1?u&kQ<6Dhg`*7V0|xn;l*dk04_CBV<`>jxAt`L!nB zGxt{^Oz{*iIU+^L#!4OpF~ie*jVYc~Aa|@K(E|+N;;bJi0Py)&q#3-7L0hhLe8n#v zz5$b(SOY8;3Pq|yp{j5w6om$^2;d5g5>(;ZEO37roBhUyz0)qp8zt{!hU~LmW`2vHh{H6~Eq-q5pgN6qt zqcqkgY()wNb|@8KbD{$8{qjo+g0NmSFw0Taj%ch8Y&Dv?XqYZiPhTGmbPE)zud1d7 z*9Dpwj>W*BFg-XN0KWf67S;v>?70|kBEgp)Kn6opQ80vxJ`9F1Kw7aQ~Zwr literal 0 HcmV?d00001 diff --git a/spec/jobs/regular/digest_rag_upload_spec.rb b/spec/jobs/regular/digest_rag_upload_spec.rb index eec30ac6..d3b1ed58 100644 --- a/spec/jobs/regular/digest_rag_upload_spec.rb +++ b/spec/jobs/regular/digest_rag_upload_spec.rb @@ -3,7 +3,7 @@ RSpec.describe Jobs::DigestRagUpload do fab!(:persona) { Fabricate(:ai_persona) } fab!(:upload) { Fabricate(:upload, extension: "txt") } - fab!(:pdf_upload) { Fabricate(:upload, extension: "pdf") } + fab!(:image_upload) { Fabricate(:upload, extension: "png") } let(:document_file) { StringIO.new("some text" * 200) } 
fab!(:cloudflare_embedding_def) @@ -31,13 +31,13 @@ RSpec.describe Jobs::DigestRagUpload do end describe "#execute" do - context "when processing a PDF upload" do + context "when processing an image upload" do it "will reject the indexing if the site setting is not enabled" do - SiteSetting.ai_rag_pdf_images_enabled = false + SiteSetting.ai_rag_images_enabled = false expect { described_class.new.execute( - upload_id: pdf_upload.id, + upload_id: image_upload.id, target_id: persona.id, target_type: persona.class.to_s, ) diff --git a/spec/lib/utils/pdf_to_text_spec.rb b/spec/lib/utils/pdf_to_text_spec.rb new file mode 100644 index 00000000..ccde06ef --- /dev/null +++ b/spec/lib/utils/pdf_to_text_spec.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +RSpec.describe DiscourseAi::Utils::PdfToText do + fab!(:llm_model) + fab!(:user) + let(:pdf) { plugin_file_from_fixtures("2-page.pdf", "rag") } + let(:upload) { UploadCreator.new(pdf, "2-page.pdf").create_for(Discourse.system_user.id) } + before { SiteSetting.authorized_extensions = "pdf|png|jpg|jpeg" } + + describe "#extract_text" do + it "extracts text from PDF pages" do + pdf_to_text = described_class.new(upload: upload) + pages = [] + pdf_to_text.extract_text { |page| pages << page } + + expect(pages).to eq(["Page 1", "Page 2"]) + end + end + + context "when improving PDF extraction with LLM" do + it "can properly simulate a file" do + if ENV["CI"] + skip "This test requires imagemagick is installed with ghostscript support - which is not available in CI" + end + + responses = [ + "Page 1: LLM chunk 1Page 1: LLM chunk 2", + "Page 2: LLM chunk 3", + ] + + pages = [] + DiscourseAi::Completions::Llm.with_prepared_responses(responses) do |_, _, _prompts| + file = described_class.as_fake_file(upload: upload, user: user, llm_model: llm_model) + + while content = file.read(100_000) + pages << content + end + end + + expect(pages).to eq(["Page 1: LLM chunk 1", "Page 1: LLM chunk 2", "Page 2: LLM chunk 3"]) + end + + it 
"works as expected" do + if ENV["CI"] + skip "This test requires imagemagick is installed with ghostscript support - which is not available in CI" + end + pdf_to_text = described_class.new(upload: upload, user: user, llm_model: llm_model) + pages = [] + + responses = [ + "Page 1: LLM chunk 1Page 1: LLM chunk 2", + "Page 2: LLM chunk 3", + ] + + DiscourseAi::Completions::Llm.with_prepared_responses(responses) do |_, _, _prompts| + pdf_to_text.extract_text { |page| pages << page } + end + + expect(pages).to eq(["Page 1: LLM chunk 1", "Page 1: LLM chunk 2", "Page 2: LLM chunk 3"]) + end + end +end diff --git a/spec/requests/admin/rag_document_fragments_controller_spec.rb b/spec/requests/admin/rag_document_fragments_controller_spec.rb index 7a906aaf..24b4b387 100644 --- a/spec/requests/admin/rag_document_fragments_controller_spec.rb +++ b/spec/requests/admin/rag_document_fragments_controller_spec.rb @@ -24,10 +24,10 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do end describe "POST #upload_file" do - let :fake_pdf do + let :fake_image do @cleanup_files ||= [] - tempfile = Tempfile.new(%w[test .pdf]) - tempfile.write("fake pdf") + tempfile = Tempfile.new(%w[test .png]) + tempfile.write("fake image") tempfile.rewind @cleanup_files << tempfile tempfile @@ -46,26 +46,26 @@ RSpec.describe DiscourseAi::Admin::RagDocumentFragmentsController do end it "rejects PDF files if site setting is not enabled" do - SiteSetting.ai_rag_pdf_images_enabled = false + SiteSetting.ai_rag_images_enabled = false post "/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json", params: { - file: Rack::Test::UploadedFile.new(fake_pdf), + file: Rack::Test::UploadedFile.new(fake_image), } expect(response.status).to eq(400) end - it "allows PDF files if site setting is enabled" do - SiteSetting.ai_rag_pdf_images_enabled = true + it "allows image files if site setting is enabled" do + SiteSetting.ai_rag_images_enabled = true post 
"/admin/plugins/discourse-ai/rag-document-fragments/files/upload.json", params: { - file: Rack::Test::UploadedFile.new(fake_pdf), + file: Rack::Test::UploadedFile.new(fake_image), } upload = Upload.last - expect(upload.original_filename).to end_with(".pdf") + expect(upload.original_filename).to end_with(".png") end end end