diff --git a/app/controllers/admin/communication/websites/posts/curations_controller.rb b/app/controllers/admin/communication/websites/posts/curations_controller.rb index 111b201ca983bcdff52ddcaab87b9036a5123c46..436b60dfc08bd03ffd7efda246e2cdb059c3aa79 100644 --- a/app/controllers/admin/communication/websites/posts/curations_controller.rb +++ b/app/controllers/admin/communication/websites/posts/curations_controller.rb @@ -4,7 +4,7 @@ class Admin::Communication::Websites::Posts::CurationsController < Admin::Commun end def create - @curator = Curator.new @website, current_user, current_website_language, curation_params[:url] + @curator = Importers::Curator.new @website, current_user, current_website_language, curation_params[:url] if @curator.valid? redirect_to [:admin, @curator.post], notice: t('admin.successfully_created_html', model: @curator.post.to_s) diff --git a/app/services/curator.rb b/app/services/curator.rb deleted file mode 100644 index 4f7c5042800995657d8cb46f6614626fb6e8a485..0000000000000000000000000000000000000000 --- a/app/services/curator.rb +++ /dev/null @@ -1,54 +0,0 @@ -class Curator - attr_reader :website, :user, :language, :url, :post - - def initialize(website, user, language, url) - @website = website - @user = user - @language = language - @url = url - create_post! - attach_image! unless page.image.blank? - rescue - end - - def valid? - @post.valid? - end - - protected - - def create_post! - @post = website.posts.create( - university: website.university, - title: page.title, - slug: page.title.parameterize, - author: @user.person, - published_at: Time.now, - language_id: @language.id - ) - @chapter = @post.blocks.create( - university: website.university, - template_kind: :chapter, - published: true, - position: 0 - ) - text = Wordpress.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>") - data = @chapter.data.deep_dup - data['text'] = text - @chapter.data = data - @chapter.save - end - - def attach_image! - @post.featured_image.attach( - io: URI.open(page.image), - filename: File.basename(page.image).split('?').first - ) - rescue - puts "Attach image failed" - end - - def page - @page ||= Curation::Page.new(@url) - end -end diff --git a/app/services/download_service.rb b/app/services/download_service.rb deleted file mode 100644 index e0469ccfce588827aca48a6ea842a22bc1953497..0000000000000000000000000000000000000000 --- a/app/services/download_service.rb +++ /dev/null @@ -1,40 +0,0 @@ -class DownloadService - attr_reader :response - - def self.download(url) - new(url) - end - - def initialize(url) - @url = url - process! - end - - def attachable_data - { io: io, filename: filename, content_type: content_type } - end - - def io - @io ||= StringIO.new(@response.body) - end - - def filename - @filename ||= File.basename(@url) - end - - def content_type - @content_type ||= @response['Content-Type'] - end - - protected - - def process! - uri = URI(@url) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - http.verify_mode = OpenSSL::SSL::VERIFY_NONE - - request = Net::HTTP::Get.new(uri.request_uri) - @response = http.request(request) - end -end \ No newline at end of file diff --git a/app/services/importers/api/osuny/communication/website/page.rb b/app/services/importers/api/osuny/communication/website/page.rb index 38f6b82866baa6839e66c1aca90429ec5df73865..48b139e1db2a3176f1cffa9842074320bd0b0580 100644 --- a/app/services/importers/api/osuny/communication/website/page.rb +++ b/app/services/importers/api/osuny/communication/website/page.rb @@ -3,10 +3,15 @@ class Importers::Api::Osuny::Communication::Website::Page < Importers::Api::Osun protected def import + import_params + import_blocks + end + + def import_params + object.title = Importers::Cleaner.clean_string params[:title] + object.summary = Importers::Cleaner.html_to_string params[:summary] object.parent = parent - object.update page_params object.save - import_blocks end def home_page diff --git a/app/services/importers/api/osuny/communication/website/post.rb b/app/services/importers/api/osuny/communication/website/post.rb index 71d924ecb0d73171cc67739f0b8ffbc9f6f62b05..971ba20d1ed6d1faac822989ece0f10edd3da2c9 100644 --- a/app/services/importers/api/osuny/communication/website/post.rb +++ b/app/services/importers/api/osuny/communication/website/post.rb @@ -3,8 +3,7 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun protected def import - object.update post_params - object.save + import_params import_blocks import_categories end @@ -18,6 +17,14 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun ).first_or_initialize end + def import_params + object.title = Importers::Cleaner.clean_string params[:title] + object.summary = Importers::Cleaner.html_to_string params[:summary] + object.published_at = params[:published_at] + object.created_at = object.published_at + object.save + end + def import_categories categories.each do |c| category = find_or_create_category c @@ -41,10 +48,4 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun return [] unless params.has_key?(:categories) @categories ||= params[:categories] end - - def post_params - ActionController::Parameters.new({ post: params }) - .require(:post) - .permit(:title, :language, :meta_description, :summary) - end end \ No newline at end of file diff --git a/app/services/importers/cleaner.rb b/app/services/importers/cleaner.rb new file mode 100644 index 0000000000000000000000000000000000000000..ac6cf93f2d8dcd355cf3e7fbc41d31a2b7a3b9b2 --- /dev/null +++ b/app/services/importers/cleaner.rb @@ -0,0 +1,63 @@ +module Importers + class Cleaner + + def self.html_to_string(html) + h = html + h = Importers::Cleaner.clean_html h + h = ActionController::Base.helpers.strip_tags h + h + end + + def self.clean_string(string) + string = string.gsub(' ', ' ') + string = string.gsub('&', '&') + string = ActionView::Base.full_sanitizer.sanitize string + string = remove_control_chars string + string + end + + def self.clean_html(html) + # invalid byte sequence in UTF-8 + # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content + html = html.force_encoding('UTF-8').scrub + # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb + # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe + fragment = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED, + attributes: Sanitize::Config::RELAXED[:attributes].merge({ + all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'], + 'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'), + 'iframe' => [ + 'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading', + 'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align', + 'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling' + ] + }), + elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'], + remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'], + whitespace_elements: { + 'div' => { :before => "", :after => "" } + } + )) + fragment = Nokogiri::HTML5.fragment(fragment) + if fragment.css('h1').any? + # h1 => h2 ; h2 => h3 ; ... + (1..5).to_a.reverse.each do |i| + fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" } + end + end + html = fragment.to_html(preserve_newline: true) + html = remove_control_chars html + html + end + + protected + + def self.remove_control_chars(string) + # Control chars & LSEP are invisible or hard to detect + string = string.delete("
", "
", "
", "Â’") + string = string.gsub /\u2028/, '' + string + end + + end +end \ No newline at end of file diff --git a/app/services/importers/curator.rb b/app/services/importers/curator.rb new file mode 100644 index 0000000000000000000000000000000000000000..5e2631caaf1149a031489739d8fb8056c5acc6eb --- /dev/null +++ b/app/services/importers/curator.rb @@ -0,0 +1,56 @@ +module Importers + class Curator + attr_reader :website, :user, :language, :url, :post + + def initialize(website, user, language, url) + @website = website + @user = user + @language = language + @url = url + create_post! + attach_image! unless page.image.blank? + rescue + end + + def valid? + @post.valid? + end + + protected + + def create_post! + @post = website.posts.create( + university: website.university, + title: page.title, + slug: page.title.parameterize, + author: @user.person, + published_at: Time.now, + language_id: @language.id + ) + @chapter = @post.blocks.create( + university: website.university, + template_kind: :chapter, + published: true, + position: 0 + ) + text = Importers::Cleaner.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>") + data = @chapter.data.deep_dup + data['text'] = text + @chapter.data = data + @chapter.save + end + + def attach_image! + @post.featured_image.attach( + io: URI.open(page.image), + filename: File.basename(page.image).split('?').first + ) + rescue + puts "Attach image failed" + end + + def page + @page ||= Curation::Page.new(@url) + end + end +end \ No newline at end of file diff --git a/app/services/wordpress.rb b/app/services/wordpress.rb deleted file mode 100644 index 2b0240d70944d54929ccfa45af597cc0c8cfe246..0000000000000000000000000000000000000000 --- a/app/services/wordpress.rb +++ /dev/null @@ -1,100 +0,0 @@ -class Wordpress - attr_reader :url - - def self.clean_string(string) - string = string.gsub(' ', ' ') - string = string.gsub('&', '&') - string = ActionView::Base.full_sanitizer.sanitize string - string = remove_control_chars string - string - end - - def self.clean_html(html) - # invalid byte sequence in UTF-8 - # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content - html = html.force_encoding('UTF-8').scrub - # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb - # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe - fragment = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED, - attributes: Sanitize::Config::RELAXED[:attributes].merge({ - all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'], - 'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'), - 'iframe' => [ - 'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading', - 'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align', - 'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling' - ] - }), - elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'], - remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'], - whitespace_elements: { - 'div' => { :before => "", :after => "" } - } - )) - fragment = Nokogiri::HTML5.fragment(fragment) - if fragment.css('h1').any? - # h1 => h2 ; h2 => h3 ; ... - (1..5).to_a.reverse.each do |i| - fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" } - end - end - html = fragment.to_html(preserve_newline: true) - html = remove_control_chars html - html - end - - def self.remove_control_chars(string) - # Control chars & LSEP are invisible or hard to detect - string = string.delete("
", "
", "
", "Â’") - string = string.gsub /\u2028/, '' - string - end - - def initialize(url) - @url = url - end - - def authors - load "#{url}/wp-json/wp/v2/users" - end - - def categories - load "#{url}/wp-json/wp/v2/categories" - end - - def posts - load "#{url}/wp-json/wp/v2/posts" - end - - def pages - load "#{url}/wp-json/wp/v2/pages" - end - - def media - load "#{url}/wp-json/wp/v2/media" - end - - protected - - def load(url) - page = 1 - posts = [] - loop do - batch = load_paged url, page - break if batch.is_a?(Hash) || batch.empty? - posts += batch - page += 1 - end - posts - end - - def load_paged(url, page) - puts "Load #{url } on page #{page}" - load_url "#{url}?page=#{page}&per_page=100" - end - - def load_url(url) - download_service = DownloadService.download(url) - JSON.parse(download_service.response.body) - end -end diff --git a/test/integration/wordpress_test.rb b/test/integration/wordpress_test.rb deleted file mode 100644 index 8ab63b607b573857e7b79040e48b339d81c14740..0000000000000000000000000000000000000000 --- a/test/integration/wordpress_test.rb +++ /dev/null @@ -1,67 +0,0 @@ -require "test_helper" - -class WordpressTest < ActiveSupport::TestCase - test "convert apostroph" do - assert_equal 'Ouverture du CRM pendant les vacances d’Avril', - Wordpress.clean_html('Ouverture du CRM pendant les vacances d’Avril') - end - - test "convert 3 dots" do - assert_equal 'Le CRM fait le tri dans ses collections … et vous propose une vente de livres', - Wordpress.clean_html('Le CRM fait le tri dans ses collections … et vous propose une vente de livres') - end - - test "convert double quotation marks" do - assert_equal 'Conférence Joëlle Zask : “Ecologie de la participationâ€', - Wordpress.clean_html('Conférence Joëlle Zask : “Ecologie de la participation”') - end - - test "convert h1" do - assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', - Wordpress.clean_html('<h1>B.U.T. Métiers du multimédia et de l’internet</h1>') - end - - test "convert h2 without h1" do - assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', - Wordpress.clean_html('<h2>B.U.T. Métiers du multimédia et de l’internet</h2>') - end - - test "convert h2 with h1" do - assert_equal '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>', - Wordpress.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l’internet</h2>') - end - - test "convert " do - assert_equal 'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30', - Wordpress.clean_html('TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30') - end - - test "remove classes" do - assert_equal '<h2>→ Qu’est-ce que le B.U.T. ?</h2>', - Wordpress.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T. ?</h2>') - end - - test "remove line_separators (LSEP)" do - # Invisible char before A, and html code - assert_equal "Au ", - Wordpress.clean_html("
Au 
") - end - - test "remove divs" do - # Quid des images ? Comment gérer le transfert vers scaleway + active storage dans le code ? - assert_equal '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>', - Wordpress.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>') - - - end - - test "convert in titles" do - assert_equal ' ', - Wordpress.clean_string(' ') - end - - test "authorize iframes" do - assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>", - Wordpress.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>') - end -end diff --git a/test/services/importers/cleaner_test.rb b/test/services/importers/cleaner_test.rb new file mode 100644 index 0000000000000000000000000000000000000000..eb0cb64db9116e24542892a6395ccaa1e8f885d0 --- /dev/null +++ b/test/services/importers/cleaner_test.rb @@ -0,0 +1,67 @@ +require "test_helper" + +class Importers::CleanerTest < ActiveSupport::TestCase + test "convert apostroph" do + assert_equal 'Ouverture du CRM pendant les vacances d’Avril', + Importers::Cleaner.clean_html('Ouverture du CRM pendant les vacances d’Avril') + end + + test "convert 3 dots" do + assert_equal 'Le CRM fait le tri dans ses collections … et vous propose une vente de livres', + Importers::Cleaner.clean_html('Le CRM fait le tri dans ses collections … et vous propose une vente de livres') + end + + test "convert double quotation marks" do + assert_equal 'Conférence Joëlle Zask : “Ecologie de la participationâ€', + Importers::Cleaner.clean_html('Conférence Joëlle Zask : “Ecologie de la participation”') + end + + test "convert h1" do + assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', + Importers::Cleaner.clean_html('<h1>B.U.T. Métiers du multimédia et de l’internet</h1>') + end + + test "convert h2 without h1" do + assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', + Importers::Cleaner.clean_html('<h2>B.U.T. Métiers du multimédia et de l’internet</h2>') + end + + test "convert h2 with h1" do + assert_equal '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>', + Importers::Cleaner.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l’internet</h2>') + end + + test "convert " do + assert_equal 'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30', + Importers::Cleaner.clean_html('TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30') + end + + test "remove classes" do + assert_equal '<h2>→ Qu’est-ce que le B.U.T. ?</h2>', + Importers::Cleaner.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T. ?</h2>') + end + + test "remove line_separators (LSEP)" do + # Invisible char before A, and html code + assert_equal "Au ", + Importers::Cleaner.clean_html("
Au 
") + end + + test "remove divs" do + # Quid des images ? Comment gérer le transfert vers scaleway + active storage dans le code ? + assert_equal '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>', + Importers::Cleaner.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>') + + + end + + test "convert in titles" do + assert_equal ' ', + Importers::Cleaner.clean_string(' ') + end + + test "authorize iframes" do + assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>", + Importers::Cleaner.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>') + end +end