From b907dee1d5eee57325da3fea4e20eac22d7f95da Mon Sep 17 00:00:00 2001
From: Arnaud Levy <contact@arnaudlevy.com>
Date: Thu, 19 Oct 2023 07:06:18 +0200
Subject: [PATCH] clean

---
 .../websites/posts/curations_controller.rb    |   2 +-
 app/services/curator.rb                       |  54 ----------
 app/services/download_service.rb              |  40 -------
 .../api/osuny/communication/website/page.rb   |   9 +-
 .../api/osuny/communication/website/post.rb   |  17 +--
 app/services/importers/cleaner.rb             |  63 +++++++++++
 app/services/importers/curator.rb             |  56 ++++++++++
 app/services/wordpress.rb                     | 100 ------------------
 test/integration/wordpress_test.rb            |  67 ------------
 test/services/importers/cleaner_test.rb       |  67 ++++++++++++
 10 files changed, 203 insertions(+), 272 deletions(-)
 delete mode 100644 app/services/curator.rb
 delete mode 100644 app/services/download_service.rb
 create mode 100644 app/services/importers/cleaner.rb
 create mode 100644 app/services/importers/curator.rb
 delete mode 100644 app/services/wordpress.rb
 delete mode 100644 test/integration/wordpress_test.rb
 create mode 100644 test/services/importers/cleaner_test.rb

diff --git a/app/controllers/admin/communication/websites/posts/curations_controller.rb b/app/controllers/admin/communication/websites/posts/curations_controller.rb
index 111b201ca..436b60dfc 100644
--- a/app/controllers/admin/communication/websites/posts/curations_controller.rb
+++ b/app/controllers/admin/communication/websites/posts/curations_controller.rb
@@ -4,7 +4,7 @@ class Admin::Communication::Websites::Posts::CurationsController < Admin::Commun
   end
 
   def create
-    @curator = Curator.new @website, current_user, current_website_language, curation_params[:url]
+    @curator = Importers::Curator.new @website, current_user, current_website_language, curation_params[:url]
     if @curator.valid?
       redirect_to [:admin, @curator.post],
                   notice: t('admin.successfully_created_html', model: @curator.post.to_s)
diff --git a/app/services/curator.rb b/app/services/curator.rb
deleted file mode 100644
index 4f7c50428..000000000
--- a/app/services/curator.rb
+++ /dev/null
@@ -1,54 +0,0 @@
-class Curator
-  attr_reader :website, :user, :language, :url, :post
-
-  def initialize(website, user, language, url)
-    @website = website
-    @user = user
-    @language = language
-    @url = url
-    create_post!
-    attach_image! unless page.image.blank?
-  rescue
-  end
-
-  def valid?
-    @post.valid?
-  end
-
-  protected
-
-  def create_post!
-    @post = website.posts.create(
-      university: website.university,
-      title: page.title,
-      slug: page.title.parameterize,
-      author: @user.person,
-      published_at: Time.now,
-      language_id: @language.id
-    )
-    @chapter = @post.blocks.create(
-      university: website.university,
-      template_kind: :chapter,
-      published: true,
-      position: 0
-    )
-    text = Wordpress.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>")
-    data = @chapter.data.deep_dup
-    data['text'] = text
-    @chapter.data = data
-    @chapter.save
-  end
-
-  def attach_image!
-    @post.featured_image.attach(
-      io: URI.open(page.image),
-      filename: File.basename(page.image).split('?').first
-    )
-  rescue
-    puts "Attach image failed"
-  end
-
-  def page
-    @page ||= Curation::Page.new(@url)
-  end
-end
diff --git a/app/services/download_service.rb b/app/services/download_service.rb
deleted file mode 100644
index e0469ccfc..000000000
--- a/app/services/download_service.rb
+++ /dev/null
@@ -1,40 +0,0 @@
-class DownloadService
-  attr_reader :response
-
-  def self.download(url)
-    new(url)
-  end
-
-  def initialize(url)
-    @url = url
-    process!
-  end
-
-  def attachable_data
-    { io: io, filename: filename, content_type: content_type }
-  end
-
-  def io
-    @io ||= StringIO.new(@response.body)
-  end
-
-  def filename
-    @filename ||= File.basename(@url)
-  end
-
-  def content_type
-    @content_type ||= @response['Content-Type']
-  end
-
-  protected
-
-  def process!
-    uri = URI(@url)
-    http = Net::HTTP.new(uri.host, uri.port)
-    http.use_ssl = true
-    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
-
-    request = Net::HTTP::Get.new(uri.request_uri)
-    @response = http.request(request)
-  end
-end
\ No newline at end of file
diff --git a/app/services/importers/api/osuny/communication/website/page.rb b/app/services/importers/api/osuny/communication/website/page.rb
index 38f6b8286..48b139e1d 100644
--- a/app/services/importers/api/osuny/communication/website/page.rb
+++ b/app/services/importers/api/osuny/communication/website/page.rb
@@ -3,10 +3,15 @@ class Importers::Api::Osuny::Communication::Website::Page < Importers::Api::Osun
   protected
 
   def import
+    import_params
+    import_blocks
+  end
+
+  def import_params
+    object.title = Importers::Cleaner.clean_string params[:title]
+    object.summary = Importers::Cleaner.html_to_string params[:summary]
     object.parent = parent
-    object.update page_params
     object.save
-    import_blocks
   end
 
   def home_page
diff --git a/app/services/importers/api/osuny/communication/website/post.rb b/app/services/importers/api/osuny/communication/website/post.rb
index 71d924ecb..971ba20d1 100644
--- a/app/services/importers/api/osuny/communication/website/post.rb
+++ b/app/services/importers/api/osuny/communication/website/post.rb
@@ -3,8 +3,7 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun
   protected
 
   def import
-    object.update post_params
-    object.save
+    import_params
     import_blocks
     import_categories
   end
@@ -18,6 +17,14 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun
     ).first_or_initialize
   end
 
+  def import_params # copies the cleaned API params onto the post, then saves it
+    object.title = Importers::Cleaner.clean_string params[:title] # plain text, tags stripped
+    object.summary = Importers::Cleaner.html_to_string params[:summary] # sanitized, then reduced to text
+    object.published_at = params[:published_at]
+    object.created_at = object.published_at # creation date is aligned with the publication date
+    object.save # the result of save is not checked here
+  end
+
   def import_categories
     categories.each do |c|
       category = find_or_create_category c
@@ -41,10 +48,4 @@ class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osun
     return [] unless params.has_key?(:categories)
     @categories ||= params[:categories]
   end
-
-  def post_params
-    ActionController::Parameters.new({ post: params })
-      .require(:post)
-      .permit(:title, :language, :meta_description, :summary)
-  end
 end
\ No newline at end of file
diff --git a/app/services/importers/cleaner.rb b/app/services/importers/cleaner.rb
new file mode 100644
index 000000000..ac6cf93f2
--- /dev/null
+++ b/app/services/importers/cleaner.rb
@@ -0,0 +1,63 @@
+module Importers
+  # Cleans imported HTML and plain strings: sanitizing, heading shift, line separator removal.
+  class Cleaner
+
+    # Sanitizes html with clean_html, then strips every remaining tag.
+    # nil is treated as an empty string instead of raising NoMethodError.
+    def self.html_to_string(html)
+      ActionController::Base.helpers.strip_tags clean_html(html)
+    end
+
+    # Plain-text cleanup: turns &nbsp; into a space and &amp; into &, strips all tags, removes LSEP.
+    # nil is treated as an empty string instead of raising NoMethodError.
+    def self.clean_string(string)
+      string = string.to_s.gsub('&nbsp;', ' ').gsub('&amp;', '&')
+      string = ActionView::Base.full_sanitizer.sanitize string
+      remove_control_chars string
+    end
+
+    # Sanitizes rich HTML: drops divs, class/style attributes and rel on links, allows iframes,
+    # and shifts every heading down one level when the fragment contains an h1.
+    def self.clean_html(html)
+      # invalid byte sequence in UTF-8
+      # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content
+      # to_s.dup: accept nil, and never mutate (or fail on) a frozen string owned by the caller
+      html = html.to_s.dup.force_encoding('UTF-8').scrub
+      # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb
+      # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe
+      sanitized = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED,
+        attributes: Sanitize::Config::RELAXED[:attributes].merge({
+          all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'],
+          'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'),
+          'iframe' => [
+            'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading',
+            'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align',
+            'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling'
+          ]
+        }),
+        elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'],
+        remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'],
+        whitespace_elements: {
+          'div' => { :before => "", :after => "" }
+        }
+      ))
+      fragment = Nokogiri::HTML5.fragment(sanitized)
+      if fragment.css('h1').any?
+        # h1 => h2 ; h2 => h3 ; ... (deepest level first, so no heading is shifted twice)
+        5.downto(1) do |i|
+          fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" }
+        end
+      end
+      remove_control_chars fragment.to_html(preserve_newline: true)
+    end
+
+    # Removes the line separator (LSEP, U+2028), raw or written as a numeric entity.
+    # Control chars & LSEP are invisible or hard to detect.
+    # gsub is used because String#delete with several arguments only removes characters common to all.
+    # Public on purpose: visibility keywords such as `protected` do not apply to `def self.` methods.
+    def self.remove_control_chars(string)
+      string.gsub(/\u2028|&#8232;|&#x2028;/i, '')
+    end
+
+  end
+end
\ No newline at end of file
diff --git a/app/services/importers/curator.rb b/app/services/importers/curator.rb
new file mode 100644
index 000000000..5e2631caa
--- /dev/null
+++ b/app/services/importers/curator.rb
@@ -0,0 +1,56 @@
+module Importers
+  # Creates a post, with one chapter block and an optional featured image, from an external URL.
+  class Curator
+    attr_reader :website, :user, :language, :url, :post
+
+    def initialize(website, user, language, url)
+      @website, @user, @language, @url = website, user, language, url
+      create_post!
+      attach_image! unless page.image.blank?
+    rescue => e # best effort: a failed curation never raises, it leaves the curator invalid
+      Rails.logger.warn "Curation failed for #{@url}: #{e.class} #{e.message}"
+    end
+
+    # False when the post could not be created at all (@post is nil) or was created invalid.
+    def valid?
+      @post.present? && @post.valid?
+    end
+
+    protected
+
+    def create_post!
+      @post = website.posts.create(
+        university: website.university,
+        title: page.title,
+        slug: page.title.parameterize,
+        author: @user.person,
+        published_at: Time.now,
+        language_id: @language.id
+      )
+      @chapter = @post.blocks.create(
+        university: website.university,
+        template_kind: :chapter,
+        published: true,
+        position: 0
+      )
+      text = Importers::Cleaner.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>")
+      data = @chapter.data.deep_dup # work on a copy, then reassign the whole attribute
+      data['text'] = text
+      @chapter.data = data
+      @chapter.save
+    end
+
+    def attach_image!
+      @post.featured_image.attach(
+        io: URI.parse(page.image).open, # unlike URI.open, never falls back to Kernel#open (files, pipes)
+        filename: File.basename(page.image).split('?').first # drop the query string
+      )
+    rescue => e # the image is optional: the post is kept without it
+      Rails.logger.warn "Attach image failed: #{e.class} #{e.message}"
+    end
+
+    def page
+      @page ||= Curation::Page.new(@url)
+    end
+  end
+end
\ No newline at end of file
diff --git a/app/services/wordpress.rb b/app/services/wordpress.rb
deleted file mode 100644
index 2b0240d70..000000000
--- a/app/services/wordpress.rb
+++ /dev/null
@@ -1,100 +0,0 @@
-class Wordpress
-  attr_reader :url
-
-  def self.clean_string(string)
-    string = string.gsub('&nbsp;', ' ')
-    string = string.gsub('&amp;', '&')
-    string = ActionView::Base.full_sanitizer.sanitize string
-    string = remove_control_chars string
-    string
-  end
-
-  def self.clean_html(html)
-    # invalid byte sequence in UTF-8
-    # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content
-    html = html.force_encoding('UTF-8').scrub
-    # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb
-    # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe
-    fragment = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED,
-      attributes: Sanitize::Config::RELAXED[:attributes].merge({
-        all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'],
-        'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'),
-        'iframe' => [
-          'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading',
-          'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align',
-          'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling'
-        ]
-      }),
-      elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'],
-      remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'],
-      whitespace_elements: {
-        'div' => { :before => "", :after => "" }
-      }
-    ))
-    fragment = Nokogiri::HTML5.fragment(fragment)
-    if fragment.css('h1').any?
-      # h1 => h2 ; h2 => h3 ; ...
-      (1..5).to_a.reverse.each do |i|
-        fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" }
-      end
-    end
-    html = fragment.to_html(preserve_newline: true)
-    html = remove_control_chars html
-    html
-  end
-
-  def self.remove_control_chars(string)
-    # Control chars & LSEP are invisible or hard to detect
-    string = string.delete("
", "&#8232;", "&#x2028;", "’")
-    string = string.gsub /\u2028/, ''
-    string
-  end
-
-  def initialize(url)
-    @url = url
-  end
-
-  def authors
-    load "#{url}/wp-json/wp/v2/users"
-  end
-
-  def categories
-    load "#{url}/wp-json/wp/v2/categories"
-  end
-
-  def posts
-    load "#{url}/wp-json/wp/v2/posts"
-  end
-
-  def pages
-    load "#{url}/wp-json/wp/v2/pages"
-  end
-
-  def media
-    load "#{url}/wp-json/wp/v2/media"
-  end
-
-  protected
-
-  def load(url)
-    page = 1
-    posts = []
-    loop do
-      batch = load_paged url, page
-      break if batch.is_a?(Hash) || batch.empty?
-      posts += batch
-      page += 1
-    end
-    posts
-  end
-
-  def load_paged(url, page)
-    puts "Load #{url } on page #{page}"
-    load_url "#{url}?page=#{page}&per_page=100"
-  end
-
-  def load_url(url)
-    download_service = DownloadService.download(url)
-    JSON.parse(download_service.response.body)
-  end
-end
diff --git a/test/integration/wordpress_test.rb b/test/integration/wordpress_test.rb
deleted file mode 100644
index 8ab63b607..000000000
--- a/test/integration/wordpress_test.rb
+++ /dev/null
@@ -1,67 +0,0 @@
-require "test_helper"
-
-class WordpressTest < ActiveSupport::TestCase
-  test "convert apostroph" do
-    assert_equal  'Ouverture du CRM pendant les vacances d’Avril',
-                  Wordpress.clean_html('Ouverture du CRM pendant les vacances d&#8217;Avril')
-  end
-
-  test "convert 3 dots" do
-    assert_equal  'Le CRM fait le tri dans ses collections … et vous propose une vente de livres',
-                  Wordpress.clean_html('Le CRM fait le tri dans ses collections &#8230; et vous propose une vente de livres')
-  end
-
-  test "convert double quotation marks" do
-    assert_equal  'Conférence Joëlle Zask : “Ecologie de la participation”',
-                  Wordpress.clean_html('Conférence Joëlle Zask : &#8220;Ecologie de la participation&#8221;')
-  end
-
-  test "convert h1" do
-    assert_equal  '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>',
-                  Wordpress.clean_html('<h1>B.U.T. Métiers du multimédia et de l&#8217;internet</h1>')
-  end
-
-  test "convert h2 without h1" do
-    assert_equal  '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>',
-                  Wordpress.clean_html('<h2>B.U.T. Métiers du multimédia et de l&#8217;internet</h2>')
-  end
-
-  test "convert h2 with h1" do
-    assert_equal  '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>',
-                  Wordpress.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l&#8217;internet</h2>')
-  end
-
-  test "convert " do
-    assert_equal  'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30',
-                  Wordpress.clean_html('TRAVAILLER DEMAIN, Débat &#8211; le 10 mai à 18h30')
-  end
-
-  test "remove classes" do
-    assert_equal  '<h2>→ Qu’est-ce que le B.U.T.&nbsp;?</h2>',
-                  Wordpress.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T.&nbsp;?</h2>')
-  end
-
-  test "remove line_separators (LSEP)" do
-    # Invisible char before A, and html code
-    assert_equal  "Au ",
-                  Wordpress.clean_html("
Au &#8232;")
-  end
-
-  test "remove divs" do
-    # Quid des images ? Comment gérer le transfert vers scaleway + active storage dans le code ?
-    assert_equal  '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>',
-                  Wordpress.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>')
-
-
-  end
-
-  test "convert &nbsp; in titles" do
-    assert_equal  ' ',
-                  Wordpress.clean_string('&nbsp;')
-  end
-
-  test "authorize iframes" do
-    assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>",
-                 Wordpress.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu&#039;est-ce que c&#039;est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>')
-  end
-end
diff --git a/test/services/importers/cleaner_test.rb b/test/services/importers/cleaner_test.rb
new file mode 100644
index 000000000..eb0cb64db
--- /dev/null
+++ b/test/services/importers/cleaner_test.rb
@@ -0,0 +1,67 @@
+require "test_helper"
+
+# Covers Importers::Cleaner: entity decoding, heading shift, removal of classes, divs and
+# line separators, and the iframe allow-list.
+class Importers::CleanerTest < ActiveSupport::TestCase
+  test "convert apostrophe" do
+    assert_equal  'Ouverture du CRM pendant les vacances d’Avril',
+                  Importers::Cleaner.clean_html('Ouverture du CRM pendant les vacances d&#8217;Avril')
+  end
+
+  test "convert 3 dots" do
+    assert_equal  'Le CRM fait le tri dans ses collections … et vous propose une vente de livres',
+                  Importers::Cleaner.clean_html('Le CRM fait le tri dans ses collections &#8230; et vous propose une vente de livres')
+  end
+
+  test "convert double quotation marks" do
+    assert_equal  'Conférence Joëlle Zask : “Ecologie de la participation”',
+                  Importers::Cleaner.clean_html('Conférence Joëlle Zask : &#8220;Ecologie de la participation&#8221;')
+  end
+
+  test "convert h1" do
+    assert_equal  '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>',
+                  Importers::Cleaner.clean_html('<h1>B.U.T. Métiers du multimédia et de l&#8217;internet</h1>')
+  end
+
+  test "convert h2 without h1" do
+    assert_equal  '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>',
+                  Importers::Cleaner.clean_html('<h2>B.U.T. Métiers du multimédia et de l&#8217;internet</h2>')
+  end
+
+  test "convert h2 with h1" do
+    assert_equal  '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>',
+                  Importers::Cleaner.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l&#8217;internet</h2>')
+  end
+
+  test "convert en dash" do
+    assert_equal  'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30',
+                  Importers::Cleaner.clean_html('TRAVAILLER DEMAIN, Débat &#8211; le 10 mai à 18h30')
+  end
+
+  test "remove classes" do
+    assert_equal  '<h2>→ Qu’est-ce que le B.U.T.&nbsp;?</h2>',
+                  Importers::Cleaner.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T.&nbsp;?</h2>')
+  end
+
+  test "remove line_separators (LSEP)" do
+    # LSEP (U+2028) before A, written as an escape because the raw char is invisible, and its html code
+    assert_equal  "Au ",
+                  Importers::Cleaner.clean_html("\u2028Au &#8232;")
+  end
+
+  test "remove divs" do
+    # What about images? How should the transfer to Scaleway + Active Storage be handled in the code?
+    assert_equal  '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>',
+                  Importers::Cleaner.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>')
+  end
+
+  test "convert &nbsp; in titles" do
+    assert_equal  ' ',
+                  Importers::Cleaner.clean_string('&nbsp;')
+  end
+
+  test "authorize iframes" do
+    assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>",
+                 Importers::Cleaner.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu&#039;est-ce que c&#039;est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>')
+  end
+end
-- 
GitLab