From c70f43f130fb98e4a1adb1afd9b621aaa49088e3 Mon Sep 17 00:00:00 2001 From: Rafael dos Santos Silva Date: Wed, 17 Jan 2024 15:17:58 -0300 Subject: [PATCH] FIX: Truncate content for sentiment/toxicity classification (#431) --- lib/sentiment/sentiment_classification.rb | 13 ++++++++----- lib/toxicity/toxicity_classification.rb | 17 +++++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/lib/sentiment/sentiment_classification.rb b/lib/sentiment/sentiment_classification.rb index dc5b2e87..1d36e6a1 100644 --- a/lib/sentiment/sentiment_classification.rb +++ b/lib/sentiment/sentiment_classification.rb @@ -48,11 +48,14 @@ module DiscourseAi end def content_of(target_to_classify) - if target_to_classify.post_number == 1 - "#{target_to_classify.topic.title}\n#{target_to_classify.raw}" - else - target_to_classify.raw - end + content = + if target_to_classify.post_number == 1 + "#{target_to_classify.topic.title}\n#{target_to_classify.raw}" + else + target_to_classify.raw + end + + Tokenizer::BertTokenizer.truncate(content, 512) end def endpoint diff --git a/lib/toxicity/toxicity_classification.rb b/lib/toxicity/toxicity_classification.rb index 8bb5f788..c178d2e1 100644 --- a/lib/toxicity/toxicity_classification.rb +++ b/lib/toxicity/toxicity_classification.rb @@ -59,13 +59,18 @@ module DiscourseAi end def content_of(target_to_classify) - return target_to_classify.message if target_to_classify.is_a?(Chat::Message) + content = + if target_to_classify.is_a?(Chat::Message) + target_to_classify.message + else + if target_to_classify.post_number == 1 + "#{target_to_classify.topic.title}\n#{target_to_classify.raw}" + else + target_to_classify.raw + end + end - if target_to_classify.post_number == 1 - "#{target_to_classify.topic.title}\n#{target_to_classify.raw}" - else - target_to_classify.raw - end + Tokenizer::BertTokenizer.truncate(content, 512) end def endpoint