From 5a839ed0e0e0c25c9ca89fb46bc1229880e3c115 Mon Sep 17 00:00:00 2001
From: Arnaud Levy <contact@arnaudlevy.com>
Date: Fri, 8 Oct 2021 19:22:17 +0200
Subject: [PATCH] refactor: extract paginated Wordpress service, sanitize imported HTML

---
 Gemfile                                       |  1 +
 Gemfile.lock                                  |  4 ++
 .../communication/website/imported/page.rb    |  4 +-
 .../communication/website/imported/post.rb    |  6 +-
 .../communication/website/imported/website.rb | 23 ++------
 app/services/wordpress.rb                     | 55 +++++++++++++++++++
 db/schema.rb                                  |  2 +-
 7 files changed, 72 insertions(+), 23 deletions(-)
 create mode 100644 app/services/wordpress.rb

diff --git a/Gemfile b/Gemfile
index 107a5a2aa..35fd20a2b 100644
--- a/Gemfile
+++ b/Gemfile
@@ -37,6 +37,7 @@ gem 'sassc-rails'
 gem 'jbuilder'
 gem 'kamifusen'
 gem 'bootstrap'
+gem 'sanitize'
 
 group :development, :test do
   gem 'byebug', platforms: [:mri, :mingw, :x64_mingw]
diff --git a/Gemfile.lock b/Gemfile.lock
index 2984c45f4..b65a8cb7c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -289,6 +289,9 @@ GEM
       ffi (~> 1.12)
     ruby2_keywords (0.0.5)
     rubyzip (2.3.2)
+    sanitize (6.0.0)
+      crass (~> 1.0.2)
+      nokogiri (>= 1.12.0)
     sassc (2.4.0)
       ffi (~> 1.9)
     sassc-rails (2.1.2)
@@ -386,6 +389,7 @@ DEPENDENCIES
   rack-mini-profiler (~> 2.0)
   rails
   rails-i18n
+  sanitize
   sassc-rails
   selenium-webdriver
   sib-api-v3-sdk
diff --git a/app/models/communication/website/imported/page.rb b/app/models/communication/website/imported/page.rb
index 8400f0fb7..056f41d6b 100644
--- a/app/models/communication/website/imported/page.rb
+++ b/app/models/communication/website/imported/page.rb
@@ -51,10 +51,10 @@ class Communication::Website::Imported::Page < ApplicationRecord
       self.page.save
     end
     # TODO only if not modified since import
-    page.title = title.to_s
+    page.title = Wordpress.clean title.to_s
     # TODO add that
     # page.description = description.to_s
-    page.text = content.to_s
+    page.text = Wordpress.clean content.to_s
     page.save
   end
 end
diff --git a/app/models/communication/website/imported/post.rb b/app/models/communication/website/imported/post.rb
index ff80c8b51..9768e9fcc 100644
--- a/app/models/communication/website/imported/post.rb
+++ b/app/models/communication/website/imported/post.rb
@@ -53,9 +53,9 @@ class Communication::Website::Imported::Post < ApplicationRecord
       self.post.save
     end
     # TODO only if not modified since import
-    post.title = title.to_s
-    post.description = description.to_s
-    post.text = content.to_s
+    post.title = Wordpress.clean title.to_s
+    post.description = Wordpress.clean description.to_s
+    post.text = Wordpress.clean content.to_s
     post.published_at = published_at if published_at
     post.save
   end
diff --git a/app/models/communication/website/imported/website.rb b/app/models/communication/website/imported/website.rb
index ecc802bf2..e6330c1fe 100644
--- a/app/models/communication/website/imported/website.rb
+++ b/app/models/communication/website/imported/website.rb
@@ -35,9 +35,12 @@ class Communication::Website::Imported::Website < ApplicationRecord
 
   protected
 
+  def wordpress # memoized Wordpress client built from website.domain_url
+    @wordpress ||= Wordpress.new(website.domain_url)
+  end
+
   def sync_pages
-    # TODO paginate
-    load("#{website.domain_url}/wp-json/wp/v2/pages?per_page=100").each do |hash|
+    wordpress.pages.each do |hash|
       url = hash['link']
       path = URI(url).path
       # TODO id
@@ -50,10 +53,7 @@ class Communication::Website::Imported::Website < ApplicationRecord
   end
 
   def sync_posts
-    # TODO paginate
-    # Communication::Website::Imported::Post.destroy_all
-    # Communication::Website::Post.destroy_all
-    load("#{website.domain_url}/wp-json/wp/v2/posts?per_page=100").each do |hash|
+    wordpress.posts.each do |hash|
       identifier = hash['id']
       post = posts.where(university: university, identifier: identifier).first_or_create
       post.url = hash['link']
@@ -66,15 +66,4 @@ class Communication::Website::Imported::Website < ApplicationRecord
       post.save
     end
   end
-
-  def load(url)
-    uri = URI(url)
-    http = Net::HTTP.new(uri.host, uri.port)
-    http.use_ssl = true
-    # IUT Bordeaux Montaigne pb with certificate
-    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
-    request = Net::HTTP::Get.new(uri.request_uri)
-    response = http.request(request)
-    JSON.parse(response.body)
-  end
 end
diff --git a/app/services/wordpress.rb b/app/services/wordpress.rb
new file mode 100644
index 000000000..0276618f2
--- /dev/null
+++ b/app/services/wordpress.rb
@@ -0,0 +1,72 @@
+# Client for one site's WordPress REST API (wp-json/wp/v2), plus an HTML
+# sanitizing helper.
+class Wordpress
+  attr_reader :domain
+
+  # Sanitizes an HTML fragment with Sanitize's RELAXED configuration.
+  #
+  # Test in console with:
+  # reload! && Communication::Website::Imported::Post.first.save && Communication::Website::Imported::Post.first.post.text
+  # R&D:
+  # https://github.com/rails/rails-html-sanitizer
+  # https://github.com/gjtorikian/html-pipeline
+  # https://github.com/rgrove/sanitize
+  def self.clean(html)
+    Sanitize.fragment html, Sanitize::Config::RELAXED
+  end
+
+  # @param domain [String] base URL of the site; REST paths are appended to it as-is
+  def initialize(domain)
+    @domain = domain
+  end
+
+  # @return [Array] all items of the posts collection, every result page concatenated
+  def posts
+    load "#{domain}/wp-json/wp/v2/posts"
+  end
+
+  # @return [Array] all items of the pages collection, every result page concatenated
+  def pages
+    load "#{domain}/wp-json/wp/v2/pages"
+  end
+
+  protected
+
+  # Fetches the collection at url page by page, starting at page 1, and stops at
+  # the first response that is not a non-empty JSON array.
+  # NOTE: this name shadows Kernel#load inside the class.
+  def load(url)
+    items = []
+    page = 1
+    loop do
+      batch = load_paged url, page
+      puts "Load page #{page}"
+      # A Hash is presumably the API's error payload once past the last page (confirm).
+      # Any other non-Array body (e.g. JSON null) ends the walk too, instead of raising.
+      break if !batch.is_a?(Array) || batch.empty?
+      items.concat batch
+      page += 1
+    end
+    items
+  end
+
+  # Fetches one result page (100 items per page) of the collection at url.
+  def load_paged(url, page)
+    load_url "#{url}?page=#{page}&per_page=100"
+  end
+
+  # GETs url and returns its body parsed as JSON.
+  # Raises JSON::ParserError when the body is not valid JSON.
+  def load_url(url)
+    uri = URI(url)
+    http = Net::HTTP.new(uri.host, uri.port)
+    # Use TLS only for https URLs; forcing it on a plain http URL breaks the request.
+    http.use_ssl = uri.scheme == 'https'
+    # IUT Bordeaux Montaigne pb with certificate
+    # FIXME(security): this turns off certificate verification for every site.
+    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    request = Net::HTTP::Get.new(uri.request_uri)
+    response = http.request(request)
+    JSON.parse(response.body)
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 1f77195c1..6c4146d71 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -213,7 +213,7 @@ ActiveRecord::Schema.define(version: 2021_10_08_152623) do
     t.uuid "research_journal_id", null: false
     t.uuid "research_journal_volume_id"
     t.datetime "created_at", precision: 6, null: false
-    t.datetime "updated_at", precision: 6, null: false
+    t.date "updated_at", null: false
     t.uuid "updated_by_id"
     t.text "abstract"
     t.text "references"
-- 
GitLab