diff --git a/app/controllers/admin/communication/websites/posts/curations_controller.rb b/app/controllers/admin/communication/websites/posts/curations_controller.rb index 111b201ca983bcdff52ddcaab87b9036a5123c46..436b60dfc08bd03ffd7efda246e2cdb059c3aa79 100644 --- a/app/controllers/admin/communication/websites/posts/curations_controller.rb +++ b/app/controllers/admin/communication/websites/posts/curations_controller.rb @@ -4,7 +4,7 @@ class Admin::Communication::Websites::Posts::CurationsController < Admin::Commun end def create - @curator = Curator.new @website, current_user, current_website_language, curation_params[:url] + @curator = Importers::Curator.new @website, current_user, current_website_language, curation_params[:url] if @curator.valid? redirect_to [:admin, @curator.post], notice: t('admin.successfully_created_html', model: @curator.post.to_s) diff --git a/app/controllers/api/osuny/application_controller.rb b/app/controllers/api/osuny/application_controller.rb index 74e5ba47e8acb595529a58bf52c2eb90195d2ecc..5f3111aecfbca56e4eeb7d5fc8a2f7ed3899c2b4 100644 --- a/app/controllers/api/osuny/application_controller.rb +++ b/app/controllers/api/osuny/application_controller.rb @@ -1,8 +1,10 @@ class Api::Osuny::ApplicationController < Api::ApplicationController + protected def verify_app_token @app = current_university.apps.find_by(token: request.headers['X-Osuny-Token']) raise_403_unless @app end + end \ No newline at end of file diff --git a/app/controllers/api/osuny/communication/websites/application_controller.rb b/app/controllers/api/osuny/communication/websites/application_controller.rb new file mode 100644 index 0000000000000000000000000000000000000000..c90b5342523f93584d5ed8ed65a63e311317f414 --- /dev/null +++ b/app/controllers/api/osuny/communication/websites/application_controller.rb @@ -0,0 +1,9 @@ +class Api::Osuny::Communication::Websites::ApplicationController < Api::Osuny::ApplicationController + + protected + + def website + @website ||= current_university.websites.find params[:website_id] + end + +end diff --git a/app/controllers/api/osuny/communication/websites/pages_controller.rb b/app/controllers/api/osuny/communication/websites/pages_controller.rb new file mode 100644 index 0000000000000000000000000000000000000000..ec16240c53da100e2253e9bbede77f1b8c3d6331 --- /dev/null +++ b/app/controllers/api/osuny/communication/websites/pages_controller.rb @@ -0,0 +1,11 @@ +class Api::Osuny::Communication::Websites::PagesController < Api::Osuny::Communication::Websites::ApplicationController + skip_before_action :verify_authenticity_token, only: :import + before_action :verify_app_token, only: :import + + def import + Importers::Api::Osuny::Communication::Website::Page.new university: current_university, + website: website, + params: params[:page] + render json: :ok + end +end diff --git a/app/controllers/api/osuny/communication/websites/posts_controller.rb b/app/controllers/api/osuny/communication/websites/posts_controller.rb index 9047d91a0bc231c4684a150d40968a0f0c5361e6..0b04dacf07e79575b872dec2b4dd66a9098cea34 100644 --- a/app/controllers/api/osuny/communication/websites/posts_controller.rb +++ b/app/controllers/api/osuny/communication/websites/posts_controller.rb @@ -1,66 +1,12 @@ -class Api::Osuny::Communication::Websites::PostsController < Api::Osuny::ApplicationController +class Api::Osuny::Communication::Websites::PostsController < Api::Osuny::Communication::Websites::ApplicationController skip_before_action :verify_authenticity_token, only: :import before_action :verify_app_token, only: :import def import - create_post - import_blocks + Importers::Api::Osuny::Communication::Website::Post.new university: current_university, + website: website, + params: params[:post] render json: :ok end - protected - - def create_post - post.language = website.default_language - post.update post_params - post.save - end - - def post - @post ||= website.posts - .where( - university: current_university, - website: website, - migration_identifier: migration_identifier - ) - .first_or_initialize - end - - def import_blocks - blocks.each do |b| - migration_identifier = b[:migration_identifier] - template_kind = b[:template_kind] - block = post.blocks - .where( - template_kind: template_kind, - migration_identifier: migration_identifier - ) - .first_or_initialize - block.university = current_university - data = b[:data].to_unsafe_hash - block.data = block.template.data.merge data - block.save - end - end - - def blocks - return [] unless params[:post].has_key?(:blocks) - @blocks ||= params[:post][:blocks] - end - - def website - @website ||= current_university.websites.find params[:website_id] - end - - def migration_identifier - @migration_identifier ||= params[:migration_identifier] - end - - def post_params - params.require(:post) - .permit( - :title, :language, :meta_description, :summary, - ) - end - end diff --git a/app/models/communication/website/page.rb b/app/models/communication/website/page.rb index d10eb394149268db637a0f3c98fbe7bfc21147aa..d7f7baeedc84a7d953cf6e0c9ae111c03659829d 100644 --- a/app/models/communication/website/page.rb +++ b/app/models/communication/website/page.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/accessibility.rb b/app/models/communication/website/page/accessibility.rb index 9905f2880bf9f857e0acefc0887df64ee598be61..6c0928dd50554288fe14fd36d40b8f8351623cef 100644 --- a/app/models/communication/website/page/accessibility.rb +++ b/app/models/communication/website/page/accessibility.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/administrator.rb b/app/models/communication/website/page/administrator.rb index ca6144eade91f91c98b17053ded8c394b9cc0112..df545493462dc27eeb09614b202e8376aef311b4 100644 --- a/app/models/communication/website/page/administrator.rb +++ b/app/models/communication/website/page/administrator.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/author.rb b/app/models/communication/website/page/author.rb index dd81fa6dbe901b31501860039c397d8b22b0d103..d8fff0dfa6c86c7b6f350da19112dbc191753008 100644 --- a/app/models/communication/website/page/author.rb +++ b/app/models/communication/website/page/author.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/communication_agenda.rb b/app/models/communication/website/page/communication_agenda.rb index 4e17c15e04162d22fee8c2df61e41aa0abecaa10..48c0fc6401c2ffab35bae56647ee2e736cd5fa54 100644 --- a/app/models/communication/website/page/communication_agenda.rb +++ b/app/models/communication/website/page/communication_agenda.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/communication_agenda_archive.rb b/app/models/communication/website/page/communication_agenda_archive.rb index 822731a13b6da15b49b878638cbb6989a5161d46..018f0b0e80fc33fd85dbfa0b6336f78aad890275 100644 --- a/app/models/communication/website/page/communication_agenda_archive.rb +++ b/app/models/communication/website/page/communication_agenda_archive.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/communication_post.rb b/app/models/communication/website/page/communication_post.rb index 7d08bc227ebf4805206da7ac54cc6045313d1a49..1ab6d6b24abe2f836a1ed92ede62317911ea30e3 100644 --- a/app/models/communication/website/page/communication_post.rb +++ b/app/models/communication/website/page/communication_post.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/education_diploma.rb b/app/models/communication/website/page/education_diploma.rb index 416c168f758f92179f229d53b95321bff6131e76..a3205229dcb789c01d3de3860f9b7093cf137c2e 100644 --- a/app/models/communication/website/page/education_diploma.rb +++ b/app/models/communication/website/page/education_diploma.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/education_program.rb b/app/models/communication/website/page/education_program.rb index 6a1e2827046b2252cf570ef5914c1bbb395b7a49..6712e72d8e0a9dfea6a65d19cbae4ae6c1863447 100644 --- a/app/models/communication/website/page/education_program.rb +++ b/app/models/communication/website/page/education_program.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/home.rb b/app/models/communication/website/page/home.rb index 8c944b1248ae956c77169ea77b1a968886a041ca..c75c3aca6673f536ab8bde22359143470c31896a 100644 --- a/app/models/communication/website/page/home.rb +++ b/app/models/communication/website/page/home.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/legal_term.rb b/app/models/communication/website/page/legal_term.rb index 91deae0750e8bf5fccf1693ea9016c5dc1d1cb6f..8f187975e66609b6dae32e95c3393450cc82b4ae 100644 --- a/app/models/communication/website/page/legal_term.rb +++ b/app/models/communication/website/page/legal_term.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/organization.rb b/app/models/communication/website/page/organization.rb index f160ede8c709f85d331440b4a44c4420af834a99..b17a2c96df03ca8f4819200d84a799838bdfbaef 100644 --- a/app/models/communication/website/page/organization.rb +++ b/app/models/communication/website/page/organization.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/person.rb b/app/models/communication/website/page/person.rb index 175d5ec7aa62fe1fb670803c912656709c0e660f..4c11cd187e8f3ffe3983031d02bd4906cf685d39 100644 --- a/app/models/communication/website/page/person.rb +++ b/app/models/communication/website/page/person.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/privacy_policy.rb b/app/models/communication/website/page/privacy_policy.rb index f1508624d2169413ec48d6977b1a86a9598b5a10..a5ae32aded565f1588ff053175c47c0e20c6c4b0 100644 --- a/app/models/communication/website/page/privacy_policy.rb +++ b/app/models/communication/website/page/privacy_policy.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/research_hal_publication.rb b/app/models/communication/website/page/research_hal_publication.rb index c6d3c4dee4005907fb55f7ca8b094eb57c8b5ca1..0e4a31a61df98a7dd1775a7ad8a78b886fbd27fa 100644 --- a/app/models/communication/website/page/research_hal_publication.rb +++ b/app/models/communication/website/page/research_hal_publication.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/research_paper.rb b/app/models/communication/website/page/research_paper.rb index 4312f9faf2c0c489aa9b35fb6fd76cc467514d8c..d8e10336593dc0ca1c02b444f171160e9c2aa80f 100644 --- a/app/models/communication/website/page/research_paper.rb +++ b/app/models/communication/website/page/research_paper.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/research_volume.rb b/app/models/communication/website/page/research_volume.rb index a4f383ee94591c114d8508d02f06da2fcf383c50..e732c2b7d8ffeb7f2e25940e9abb560a004c2c1a 100644 --- a/app/models/communication/website/page/research_volume.rb +++ b/app/models/communication/website/page/research_volume.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/researcher.rb b/app/models/communication/website/page/researcher.rb index c5ac173da59f5fa282ab2ef07f59d33c5c808af0..a82afe69a19a0a306ef64add4a08495a7c2ed86c 100644 --- a/app/models/communication/website/page/researcher.rb +++ b/app/models/communication/website/page/researcher.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/sitemap.rb b/app/models/communication/website/page/sitemap.rb index fe79d1768f24777ba90962f05b3f5b6ac6ae4674..cf8ce880f93f244d9ec5db07b3e8add146d76c94 100644 --- a/app/models/communication/website/page/sitemap.rb +++ b/app/models/communication/website/page/sitemap.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/models/communication/website/page/teacher.rb b/app/models/communication/website/page/teacher.rb index 3b55bacc26db6160aedff1821b59fcfaa52cd875..ad29eedad7063952ad2dbecc8891454ee9424cd2 100644 --- a/app/models/communication/website/page/teacher.rb +++ b/app/models/communication/website/page/teacher.rb @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/app/services/curator.rb b/app/services/curator.rb deleted file mode 100644 index 4f7c5042800995657d8cb46f6614626fb6e8a485..0000000000000000000000000000000000000000 --- a/app/services/curator.rb +++ /dev/null @@ -1,54 +0,0 @@ -class Curator - attr_reader :website, :user, :language, :url, :post - - def initialize(website, user, language, url) - @website = website - @user = user - @language = language - @url = url - create_post! - attach_image! unless page.image.blank? - rescue - end - - def valid? - @post.valid? - end - - protected - - def create_post! - @post = website.posts.create( - university: website.university, - title: page.title, - slug: page.title.parameterize, - author: @user.person, - published_at: Time.now, - language_id: @language.id - ) - @chapter = @post.blocks.create( - university: website.university, - template_kind: :chapter, - published: true, - position: 0 - ) - text = Wordpress.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>") - data = @chapter.data.deep_dup - data['text'] = text - @chapter.data = data - @chapter.save - end - - def attach_image! - @post.featured_image.attach( - io: URI.open(page.image), - filename: File.basename(page.image).split('?').first - ) - rescue - puts "Attach image failed" - end - - def page - @page ||= Curation::Page.new(@url) - end -end diff --git a/app/services/download_service.rb b/app/services/download_service.rb deleted file mode 100644 index e0469ccfce588827aca48a6ea842a22bc1953497..0000000000000000000000000000000000000000 --- a/app/services/download_service.rb +++ /dev/null @@ -1,40 +0,0 @@ -class DownloadService - attr_reader :response - - def self.download(url) - new(url) - end - - def initialize(url) - @url = url - process! - end - - def attachable_data - { io: io, filename: filename, content_type: content_type } - end - - def io - @io ||= StringIO.new(@response.body) - end - - def filename - @filename ||= File.basename(@url) - end - - def content_type - @content_type ||= @response['Content-Type'] - end - - protected - - def process! - uri = URI(@url) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - http.verify_mode = OpenSSL::SSL::VERIFY_NONE - - request = Net::HTTP::Get.new(uri.request_uri) - @response = http.request(request) - end -end \ No newline at end of file diff --git a/app/services/importers/api/osuny/communication/website/base.rb b/app/services/importers/api/osuny/communication/website/base.rb new file mode 100644 index 0000000000000000000000000000000000000000..22524bcfa459708dbfa5d36fe557fece635a8d4f --- /dev/null +++ b/app/services/importers/api/osuny/communication/website/base.rb @@ -0,0 +1,51 @@ +class Importers::Api::Osuny::Communication::Website::Base + + attr_reader :university, :website, :params + + def initialize(university:, website:, params:) + @university = university + @website = website + @params = params.to_unsafe_hash + import + end + + protected + + def import + raise NotImplementedError + end + + def object + raise NotImplementedError + end + + def language + # TODO specific language set in params + website.default_language + end + + def migration_identifier + @migration_identifier ||= params[:migration_identifier] + end + + def blocks + return [] unless params.has_key?(:blocks) + @blocks ||= params[:blocks] + end + + def import_blocks + blocks.each do |b| + migration_identifier = b[:migration_identifier] + template_kind = b[:template_kind] + block = object.blocks + .where( + migration_identifier: migration_identifier, + template_kind: template_kind + ) + .first_or_initialize + block.university = university + block.data = block.template.data.merge b[:data] + block.save + end + end +end \ No newline at end of file diff --git a/app/services/importers/api/osuny/communication/website/page.rb b/app/services/importers/api/osuny/communication/website/page.rb new file mode 100644 index 0000000000000000000000000000000000000000..48b139e1db2a3176f1cffa9842074320bd0b0580 --- /dev/null +++ b/app/services/importers/api/osuny/communication/website/page.rb @@ -0,0 +1,46 @@ +class Importers::Api::Osuny::Communication::Website::Page < Importers::Api::Osuny::Communication::Website::Base + + protected + + def import + import_params + import_blocks + end + + def import_params + object.title = Importers::Cleaner.clean_string params[:title] + object.summary = Importers::Cleaner.html_to_string params[:summary] + object.parent = parent + object.save + end + + def home_page + website.special_page(Communication::Website::Page::Home, language: language) + end + + def parent + parent_migration_identifier = params.dig(:parent, :migration_identifier) + @parent = page_with parent_migration_identifier if parent_migration_identifier + @parent = home_page if @parent.nil? + @parent + end + + def object + @object ||= page_with migration_identifier + end + + def page_with(migration_identifier) + website.pages.where( + university: university, + website: website, + migration_identifier: migration_identifier, + language: language + ).first_or_initialize + end + + def page_params + ActionController::Parameters.new({ page: params }) + .require(:page) + .permit(:title, :language, :meta_description, :summary) + end +end \ No newline at end of file diff --git a/app/services/importers/api/osuny/communication/website/post.rb b/app/services/importers/api/osuny/communication/website/post.rb new file mode 100644 index 0000000000000000000000000000000000000000..971ba20d1ed6d1faac822989ece0f10edd3da2c9 --- /dev/null +++ b/app/services/importers/api/osuny/communication/website/post.rb @@ -0,0 +1,51 @@ +class Importers::Api::Osuny::Communication::Website::Post < Importers::Api::Osuny::Communication::Website::Base + + protected + + def import + import_params + import_blocks + import_categories + end + + def object + @object ||= website.posts.where( + university: university, + website: website, + migration_identifier: migration_identifier, + language: language + ).first_or_initialize + end + + def import_params + object.title = Importers::Cleaner.clean_string params[:title] + object.summary = Importers::Cleaner.html_to_string params[:summary] + object.published_at = params[:published_at] + object.created_at = object.published_at + object.save + end + + def import_categories + categories.each do |c| + category = find_or_create_category c + next if category.nil? || category.in?(object.categories) + object.categories << category + end + end + + def find_or_create_category(data) + if data.has_key? 'name' + website.categories.where( + university: university, + website: website, + name: data['name'], + language: language + ).first_or_create + end + end + + def categories + return [] unless params.has_key?(:categories) + @categories ||= params[:categories] + end +end \ No newline at end of file diff --git a/app/services/importers/cleaner.rb b/app/services/importers/cleaner.rb new file mode 100644 index 0000000000000000000000000000000000000000..ac6cf93f2d8dcd355cf3e7fbc41d31a2b7a3b9b2 --- /dev/null +++ b/app/services/importers/cleaner.rb @@ -0,0 +1,63 @@ +module Importers + class Cleaner + + def self.html_to_string(html) + h = html + h = Importers::Cleaner.clean_html h + h = ActionController::Base.helpers.strip_tags h + h + end + + def self.clean_string(string) + string = string.gsub(' ', ' ') + string = string.gsub('&', '&') + string = ActionView::Base.full_sanitizer.sanitize string + string = remove_control_chars string + string + end + + def self.clean_html(html) + # invalid byte sequence in UTF-8 + # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content + html = html.force_encoding('UTF-8').scrub + # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb + # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe + fragment = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED, + attributes: Sanitize::Config::RELAXED[:attributes].merge({ + all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'], + 'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'), + 'iframe' => [ + 'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading', + 'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align', + 'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling' + ] + }), + elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'], + remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'], + whitespace_elements: { + 'div' => { :before => "", :after => "" } + } + )) + fragment = Nokogiri::HTML5.fragment(fragment) + if fragment.css('h1').any? + # h1 => h2 ; h2 => h3 ; ... + (1..5).to_a.reverse.each do |i| + fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" } + end + end + html = fragment.to_html(preserve_newline: true) + html = remove_control_chars html + html + end + + protected + + def self.remove_control_chars(string) + # Control chars & LSEP are invisible or hard to detect + string = string.delete("
", "
", "
", "Â’") + string = string.gsub /\u2028/, '' + string + end + + end +end \ No newline at end of file diff --git a/app/services/importers/curator.rb b/app/services/importers/curator.rb new file mode 100644 index 0000000000000000000000000000000000000000..5e2631caaf1149a031489739d8fb8056c5acc6eb --- /dev/null +++ b/app/services/importers/curator.rb @@ -0,0 +1,56 @@ +module Importers + class Curator + attr_reader :website, :user, :language, :url, :post + + def initialize(website, user, language, url) + @website = website + @user = user + @language = language + @url = url + create_post! + attach_image! unless page.image.blank? + rescue + end + + def valid? + @post.valid? + end + + protected + + def create_post! + @post = website.posts.create( + university: website.university, + title: page.title, + slug: page.title.parameterize, + author: @user.person, + published_at: Time.now, + language_id: @language.id + ) + @chapter = @post.blocks.create( + university: website.university, + template_kind: :chapter, + published: true, + position: 0 + ) + text = Importers::Cleaner.clean_html("#{page.text}<p><a href=\"#{@url}\" target=\"_blank\">Source</a></p>") + data = @chapter.data.deep_dup + data['text'] = text + @chapter.data = data + @chapter.save + end + + def attach_image! + @post.featured_image.attach( + io: URI.open(page.image), + filename: File.basename(page.image).split('?').first + ) + rescue + puts "Attach image failed" + end + + def page + @page ||= Curation::Page.new(@url) + end + end +end \ No newline at end of file diff --git a/app/services/wordpress.rb b/app/services/wordpress.rb deleted file mode 100644 index 2b0240d70944d54929ccfa45af597cc0c8cfe246..0000000000000000000000000000000000000000 --- a/app/services/wordpress.rb +++ /dev/null @@ -1,100 +0,0 @@ -class Wordpress - attr_reader :url - - def self.clean_string(string) - string = string.gsub(' ', ' ') - string = string.gsub('&', '&') - string = ActionView::Base.full_sanitizer.sanitize string - string = remove_control_chars string - string - end - - def self.clean_html(html) - # invalid byte sequence in UTF-8 - # https://stackoverflow.com/questions/32826781/invalid-byte-sequence-in-utf-8-when-sanitizing-wordpress-export-content - html = html.force_encoding('UTF-8').scrub - # Relaxed config : https://github.com/rgrove/sanitize/blob/main/lib/sanitize/config/relaxed.rb - # iframe attributes from MDN : https://developer.mozilla.org/fr/docs/Web/HTML/Element/iframe - fragment = Sanitize.fragment(html, Sanitize::Config.merge(Sanitize::Config::RELAXED, - attributes: Sanitize::Config::RELAXED[:attributes].merge({ - all: Sanitize::Config::RELAXED[:attributes][:all].dup - ['class', 'style'], - 'a' => Sanitize::Config::RELAXED[:attributes]['a'].dup.delete('rel'), - 'iframe' => [ - 'allow', 'allowfullscreen', 'allowpaymentrequest', 'csp', 'height', 'loading', - 'name', 'referrerpolicy', 'sandbox', 'src', 'srcdoc', 'width', 'align', - 'frameborder', 'longdesc', 'marginheight', 'marginwidth', 'scrolling' - ] - }), - elements: Set.new(Sanitize::Config::RELAXED[:elements]) - ['div', 'style'] + ['iframe'], - remove_contents: ['math', 'noembed', 'noframes', 'noscript', 'plaintext', 'script', 'style', 'svg', 'xmp'], - whitespace_elements: { - 'div' => { :before => "", :after => "" } - } - )) - fragment = Nokogiri::HTML5.fragment(fragment) - if fragment.css('h1').any? - # h1 => h2 ; h2 => h3 ; ... - (1..5).to_a.reverse.each do |i| - fragment.css("h#{i}").each { |element| element.name = "h#{i+1}" } - end - end - html = fragment.to_html(preserve_newline: true) - html = remove_control_chars html - html - end - - def self.remove_control_chars(string) - # Control chars & LSEP are invisible or hard to detect - string = string.delete("
", "
", "
", "Â’") - string = string.gsub /\u2028/, '' - string - end - - def initialize(url) - @url = url - end - - def authors - load "#{url}/wp-json/wp/v2/users" - end - - def categories - load "#{url}/wp-json/wp/v2/categories" - end - - def posts - load "#{url}/wp-json/wp/v2/posts" - end - - def pages - load "#{url}/wp-json/wp/v2/pages" - end - - def media - load "#{url}/wp-json/wp/v2/media" - end - - protected - - def load(url) - page = 1 - posts = [] - loop do - batch = load_paged url, page - break if batch.is_a?(Hash) || batch.empty? - posts += batch - page += 1 - end - posts - end - - def load_paged(url, page) - puts "Load #{url } on page #{page}" - load_url "#{url}?page=#{page}&per_page=100" - end - - def load_url(url) - download_service = DownloadService.download(url) - JSON.parse(download_service.response.body) - end -end diff --git a/config/routes/api.rb b/config/routes/api.rb index 8c1950d03bb111156f5d033441356bf04dde7422..61b6b6301cb94156af34801cb245cca59223ed02 100644 --- a/config/routes/api.rb +++ b/config/routes/api.rb @@ -7,6 +7,7 @@ namespace :api do get 'websites' => 'websites#index' namespace :websites do post ':website_id/posts/import' => 'posts#import' + post ':website_id/pages/import' => 'pages#import' end end namespace :server do diff --git a/db/migrate/20231018182341_add_migration_identifier_to_communication_website_pages.rb b/db/migrate/20231018182341_add_migration_identifier_to_communication_website_pages.rb new file mode 100644 index 0000000000000000000000000000000000000000..ca4c63d9de0b15a9cdc849015c952c238b1b3787 --- /dev/null +++ b/db/migrate/20231018182341_add_migration_identifier_to_communication_website_pages.rb @@ -0,0 +1,5 @@ +class AddMigrationIdentifierToCommunicationWebsitePages < ActiveRecord::Migration[7.0] + def change + add_column :communication_website_pages, :migration_identifier, :string + end +end diff --git a/db/schema.rb b/db/schema.rb index 46cc6a150e72e54a6a7bcf632e4e8b9f011f889a..659e0ae397ed6501c52e9dedbd3b6089c4283ee7 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.1].define(version: 2023_10_13_090313) do +ActiveRecord::Schema[7.1].define(version: 2023_10_18_182341) do # These are extensions that must be enabled in order to support this database enable_extension "pgcrypto" enable_extension "plpgsql" @@ -381,6 +381,7 @@ ActiveRecord::Schema[7.1].define(version: 2023_10_13_090313) do t.boolean "full_width", default: false t.string "type" t.uuid "original_id" + t.string "migration_identifier" t.index ["communication_website_id"], name: "index_communication_website_pages_on_communication_website_id" t.index ["language_id"], name: "index_communication_website_pages_on_language_id" t.index ["original_id"], name: "index_communication_website_pages_on_original_id" diff --git a/test/fixtures/communication/website/pages.yml b/test/fixtures/communication/website/pages.yml index 7acaaf1b263b1c1dac961fa81511f525564335bd..f56dc72fa319e3b2114ca661a4126b33a2d1ab4d 100644 --- a/test/fixtures/communication/website/pages.yml +++ b/test/fixtures/communication/website/pages.yml @@ -11,6 +11,7 @@ # header_text :text # kind :integer # meta_description :text +# migration_identifier :string # position :integer default(0), not null # published :boolean default(FALSE) # slug :string indexed diff --git a/test/integration/wordpress_test.rb b/test/integration/wordpress_test.rb deleted file mode 100644 index 8ab63b607b573857e7b79040e48b339d81c14740..0000000000000000000000000000000000000000 --- a/test/integration/wordpress_test.rb +++ /dev/null @@ -1,67 +0,0 @@ -require "test_helper" - -class WordpressTest < ActiveSupport::TestCase - test "convert apostroph" do - assert_equal 'Ouverture du CRM pendant les vacances d’Avril', - Wordpress.clean_html('Ouverture du CRM pendant les vacances d’Avril') - end - - test "convert 3 dots" do - assert_equal 'Le CRM fait le tri dans ses collections … et vous propose une vente de livres', - Wordpress.clean_html('Le CRM fait le tri dans ses collections … et vous propose une vente de livres') - end - - test "convert double quotation marks" do - assert_equal 'Conférence Joëlle Zask : “Ecologie de la participationâ€', - Wordpress.clean_html('Conférence Joëlle Zask : “Ecologie de la participation”') - end - - test "convert h1" do - assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', - Wordpress.clean_html('<h1>B.U.T. Métiers du multimédia et de l’internet</h1>') - end - - test "convert h2 without h1" do - assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', - Wordpress.clean_html('<h2>B.U.T. Métiers du multimédia et de l’internet</h2>') - end - - test "convert h2 with h1" do - assert_equal '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>', - Wordpress.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l’internet</h2>') - end - - test "convert " do - assert_equal 'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30', - Wordpress.clean_html('TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30') - end - - test "remove classes" do - assert_equal '<h2>→ Qu’est-ce que le B.U.T. ?</h2>', - Wordpress.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T. ?</h2>') - end - - test "remove line_separators (LSEP)" do - # Invisible char before A, and html code - assert_equal "Au ", - Wordpress.clean_html("
Au 
") - end - - test "remove divs" do - # Quid des images ? Comment gérer le transfert vers scaleway + active storage dans le code ? - assert_equal '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>', - Wordpress.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>') - - - end - - test "convert in titles" do - assert_equal ' ', - Wordpress.clean_string(' ') - end - - test "authorize iframes" do - assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>", - Wordpress.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>') - end -end diff --git a/test/services/importers/cleaner_test.rb b/test/services/importers/cleaner_test.rb new file mode 100644 index 0000000000000000000000000000000000000000..eb0cb64db9116e24542892a6395ccaa1e8f885d0 --- /dev/null +++ b/test/services/importers/cleaner_test.rb @@ -0,0 +1,67 @@ +require "test_helper" + +class Importers::CleanerTest < ActiveSupport::TestCase + test "convert apostroph" do + assert_equal 'Ouverture du CRM pendant les vacances d’Avril', + Importers::Cleaner.clean_html('Ouverture du CRM pendant les vacances d’Avril') + end + + test "convert 3 dots" do + assert_equal 'Le CRM fait le tri dans ses collections … et vous propose une vente de livres', + Importers::Cleaner.clean_html('Le CRM fait le tri dans ses collections … et vous propose une vente de livres') + end + + test "convert double quotation marks" do + assert_equal 'Conférence Joëlle Zask : “Ecologie de la participationâ€', + Importers::Cleaner.clean_html('Conférence Joëlle Zask : “Ecologie de la participation”') + end + + test "convert h1" do + assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', + Importers::Cleaner.clean_html('<h1>B.U.T. Métiers du multimédia et de l’internet</h1>') + end + + test "convert h2 without h1" do + assert_equal '<h2>B.U.T. Métiers du multimédia et de l’internet</h2>', + Importers::Cleaner.clean_html('<h2>B.U.T. Métiers du multimédia et de l’internet</h2>') + end + + test "convert h2 with h1" do + assert_equal '<h2>Bachelor Universitaire de Technologie</h2><h3>B.U.T. Métiers du multimédia et de l’internet</h3>', + Importers::Cleaner.clean_html('<h1>Bachelor Universitaire de Technologie</h1><h2>B.U.T. Métiers du multimédia et de l’internet</h2>') + end + + test "convert " do + assert_equal 'TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30', + Importers::Cleaner.clean_html('TRAVAILLER DEMAIN, Débat – le 10 mai à 18h30') + end + + test "remove classes" do + assert_equal '<h2>→ Qu’est-ce que le B.U.T. ?</h2>', + Importers::Cleaner.clean_html('<h2 class="titre-diplome">→ Qu’est-ce que le B.U.T. ?</h2>') + end + + test "remove line_separators (LSEP)" do + # Invisible char before A, and html code + assert_equal "Au ", + Importers::Cleaner.clean_html("
Au 
") + end + + test "remove divs" do + # Quid des images ? Comment gérer le transfert vers scaleway + active storage dans le code ? + assert_equal '<figure><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure>', + Importers::Cleaner.clean_html('<div class="wp-block-group"><div class="wp-block-group__inner-container"><div class="wp-block-columns"><div class="wp-block-column"><div class="wp-block-image"><figure class="alignright size-medium is-resized"><a href="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png" rel="lightbox[14475]"><img src="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png" alt="Le BUT, qu\'est-ce que c\'est ?" class="wp-image-14821" width="173" height="216" srcset="https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1-240x300.png 240w, https://www.iut.u-bordeaux-montaigne.fr/wp-content/uploads/2021/01/visuel_1.png 730w"></a></figure></div></div>') + + + end + + test "convert in titles" do + assert_equal ' ', + Importers::Cleaner.clean_string(' ') + end + + test "authorize iframes" do + assert_equal "<figure><iframe loading=\"lazy\" title=\"Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT\" width=\"640\" height=\"360\" src=\"https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\"></iframe></figure>", + Importers::Cleaner.clean_html('<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe loading="lazy" title="Le Bachelor Universitaire de Technologie, qu'est-ce que c'est ? - LES IUT" width="640" height="360" src="https://www.youtube.com/embed/5xbeKHi0txk?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></div></figure>') + end +end