diff --git a/Gemfile b/Gemfile index 107a5a2aa3f5a3ab204e92354500d19dce65bf63..35fd20a2bffa2166a590b58e180a7fc64e69f252 100644 --- a/Gemfile +++ b/Gemfile @@ -37,6 +37,7 @@ gem 'sassc-rails' gem 'jbuilder' gem 'kamifusen' gem 'bootstrap' +gem 'sanitize' group :development, :test do gem 'byebug', platforms: [:mri, :mingw, :x64_mingw] diff --git a/Gemfile.lock b/Gemfile.lock index 2984c45f4bec9da3667eae4290fbc83240c7f4be..b65a8cb7c5b95783c4bfe76a14579747e8cd5e4f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -289,6 +289,9 @@ GEM ffi (~> 1.12) ruby2_keywords (0.0.5) rubyzip (2.3.2) + sanitize (6.0.0) + crass (~> 1.0.2) + nokogiri (>= 1.12.0) sassc (2.4.0) ffi (~> 1.9) sassc-rails (2.1.2) @@ -386,6 +389,7 @@ DEPENDENCIES rack-mini-profiler (~> 2.0) rails rails-i18n + sanitize sassc-rails selenium-webdriver sib-api-v3-sdk diff --git a/app/models/communication/website/imported/page.rb b/app/models/communication/website/imported/page.rb index 8400f0fb78f7515c48aa1398450b8e872dbaea75..056f41d6b6150fd070577d67d6540bf441a35ebe 100644 --- a/app/models/communication/website/imported/page.rb +++ b/app/models/communication/website/imported/page.rb @@ -51,10 +51,10 @@ class Communication::Website::Imported::Page < ApplicationRecord self.page.save end # TODO only if not modified since import - page.title = title.to_s + page.title = Wordpress.clean title.to_s # TODO add that # page.description = description.to_s - page.text = content.to_s + page.text = Wordpress.clean content.to_s page.save end end diff --git a/app/models/communication/website/imported/post.rb b/app/models/communication/website/imported/post.rb index ff80c8b51e5185a7556e1b7fbd3db3ab4e0eca74..9768e9fcce0509695524954d3a56c32a536c86ac 100644 --- a/app/models/communication/website/imported/post.rb +++ b/app/models/communication/website/imported/post.rb @@ -53,9 +53,9 @@ class Communication::Website::Imported::Post < ApplicationRecord self.post.save end # TODO only if not modified since import - post.title = title.to_s 
- post.description = description.to_s - post.text = content.to_s + post.title = Wordpress.clean title.to_s + post.description = Wordpress.clean description.to_s + post.text = Wordpress.clean content.to_s post.published_at = published_at if published_at post.save end diff --git a/app/models/communication/website/imported/website.rb b/app/models/communication/website/imported/website.rb index ecc802bf2ae3a3dea89d7e838629b9ab5576545c..e6330c1fe766a888205d3d9cf7eeddd25274ec0f 100644 --- a/app/models/communication/website/imported/website.rb +++ b/app/models/communication/website/imported/website.rb @@ -35,9 +35,12 @@ class Communication::Website::Imported::Website < ApplicationRecord protected + def wordpress + @wordpress ||= Wordpress.new website.domain_url + end + def sync_pages - # TODO paginate - load("#{website.domain_url}/wp-json/wp/v2/pages?per_page=100").each do |hash| + wordpress.pages.each do |hash| url = hash['link'] path = URI(url).path # TODO id @@ -50,10 +53,7 @@ class Communication::Website::Imported::Website < ApplicationRecord end def sync_posts - # TODO paginate - # Communication::Website::Imported::Post.destroy_all - # Communication::Website::Post.destroy_all - load("#{website.domain_url}/wp-json/wp/v2/posts?per_page=100").each do |hash| + wordpress.posts.each do |hash| identifier = hash['id'] post = posts.where(university: university, identifier: identifier).first_or_create post.url = hash['link'] @@ -66,15 +66,4 @@ class Communication::Website::Imported::Website < ApplicationRecord post.save end end - - def load(url) - uri = URI(url) - http = Net::HTTP.new(uri.host, uri.port) - http.use_ssl = true - # IUT Bordeaux Montaigne pb with certificate - http.verify_mode = OpenSSL::SSL::VERIFY_NONE - request = Net::HTTP::Get.new(uri.request_uri) - response = http.request(request) - JSON.parse(response.body) - end end diff --git a/app/services/wordpress.rb b/app/services/wordpress.rb new file mode 100644 index 
0000000000000000000000000000000000000000..0276618f24f2748c47cb5089a57ddcaba2fd8293
--- /dev/null
+++ b/app/services/wordpress.rb
@@ -0,0 +1,77 @@
+# Thin client for a WordPress site's REST API (wp-json/wp/v2), plus an HTML
+# sanitizing helper applied to imported titles, descriptions and bodies.
+class Wordpress
+  # Number of records requested per API page.
+  PER_PAGE = 100
+
+  attr_reader :domain
+
+  # Sanitizes an HTML fragment with the Sanitize gem's RELAXED config;
+  # nil is treated as an empty string.
+  #
+  # Test in console with:
+  # reload! && Communication::Website::Imported::Post.first.save && Communication::Website::Imported::Post.first.post.text
+  # R&D:
+  # https://github.com/rails/rails-html-sanitizer
+  # https://github.com/gjtorikian/html-pipeline
+  # https://github.com/rgrove/sanitize
+  def self.clean(html)
+    Sanitize.fragment html.to_s, Sanitize::Config::RELAXED
+  end
+
+  # domain: base URL of the site, scheme included (it is parsed with URI()).
+  def initialize(domain)
+    @domain = domain
+  end
+
+  # Returns every record of the posts collection, as parsed JSON.
+  def posts
+    load endpoint('posts')
+  end
+
+  # Returns every record of the pages collection, as parsed JSON.
+  def pages
+    load endpoint('pages')
+  end
+
+  protected
+
+  # Builds a collection URL; a trailing slash on the domain is tolerated.
+  def endpoint(resource)
+    "#{domain.to_s.chomp('/')}/wp-json/wp/v2/#{resource}"
+  end
+
+  # Fetches every page of a collection and concatenates the batches.
+  # Stops on an empty batch or on anything that is not an array (presumably
+  # the error object the API answers past the last page).
+  def load(url)
+    items = []
+    page = 1
+    loop do
+      batch = load_paged url, page
+      break if !batch.is_a?(Array) || batch.empty?
+      items.concat batch
+      page += 1
+    end
+    items
+  end
+
+  # Fetches one page of a collection.
+  def load_paged(url, page)
+    load_url "#{url}?page=#{page}&per_page=#{PER_PAGE}"
+  end
+
+  # GETs a URL and returns the response body parsed as JSON.
+  def load_url(url)
+    uri = URI(url)
+    http = Net::HTTP.new(uri.host, uri.port)
+    # TLS only for https URLs; forcing it made plain-http sites fail.
+    http.use_ssl = uri.scheme == 'https'
+    # IUT Bordeaux Montaigne pb with certificate
+    # SECURITY: verification is disabled for every site, which allows
+    # man-in-the-middle tampering. Kept as is; TODO restrict to known hosts.
+    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    response = http.request Net::HTTP::Get.new(uri.request_uri)
+    JSON.parse(response.body)
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 1f77195c1707a9db98616fdc9183015813e2a96b..6c4146d71072c9303f6e5d50a7f310d81ea8a4cc 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -213,7 +213,7 @@ ActiveRecord::Schema.define(version: 2021_10_08_152623) do
     t.uuid "research_journal_id", null: false
     t.uuid "research_journal_volume_id"
     t.datetime "created_at", precision: 6, null: false
-    t.datetime "updated_at", precision: 6, null: false
+    t.date "updated_at", null: false
     t.uuid "updated_by_id"
     t.text "abstract"
     t.text "references"