# frozen_string_literal: false
module JekyllImport
module Importers
class WordPress < Importer
def self.require_deps
JekyllImport.require_with_fallback(%w(
rubygems
sequel
fileutils
safe_yaml
unidecode
))
end
def self.specify_options(c)
c.option "dbname", "--dbname DB", "Database name (default: '')"
c.option "socket", "--socket SOCKET", "Database socket (default: '')"
c.option "user", "--user USER", "Database user name (default: '')"
c.option "password", "--password PW", "Database user's password (default: '')"
c.option "host", "--host HOST", "Database host name (default: 'localhost')"
c.option "port", "--port PORT", "Database port number (default: '')"
c.option "table_prefix", "--table_prefix PREFIX", "Table prefix name (default: 'wp_')"
c.option "site_prefix", "--site_prefix PREFIX", "Site prefix name (default: '')"
c.option "clean_entities", "--clean_entities", "Whether to clean entities (default: true)"
c.option "comments", "--comments", "Whether to import comments (default: true)"
c.option "categories", "--categories", "Whether to import categories (default: true)"
c.option "tags", "--tags", "Whether to import tags (default: true)"
c.option "more_excerpt", "--more_excerpt", "Whether to use more excerpt (default: true)"
c.option "more_anchor", "--more_anchor", "Whether to use more anchor (default: true)"
c.option "status", "--status STATUS,STATUS2", Array,
"Array of allowed statuses (default: ['publish'], other options: 'draft', 'private', 'revision')"
end
# Main migrator function. Call this to perform the migration.
#
# dbname:: The name of the database
# user:: The database user name
# pass:: The database user's password
# host:: The address of the MySQL database host. Default: 'localhost'
# port:: The port number of the MySQL database. Default: '3306'
# socket:: The database socket's path
# options:: A hash table of configuration options.
#
# Supported options are:
#
# :table_prefix:: Prefix of database tables used by WordPress.
# Default: 'wp_'
# :site_prefix:: Prefix of database tables used by WordPress
# Multisite, eg: 2_.
# Default: ''
# :clean_entities:: If true, convert non-ASCII characters to HTML
# entities in the posts, comments, titles, and
# names. Requires the 'htmlentities' gem to
# work. Default: true.
# :comments:: If true, migrate post comments too. Comments
# are saved in the post's YAML front matter.
# Default: true.
# :categories:: If true, save the post's categories in its
# YAML front matter. Default: true.
# :tags:: If true, save the post's tags in its
# YAML front matter. Default: true.
# :more_excerpt:: If true, when a post has no excerpt but
# does have a tag, use the
# preceding post content as the excerpt.
# Default: true.
# :more_anchor:: If true, convert a tag into
# two HTML anchors with ids "more" and
# "more-NNN" (where NNN is the post number).
# Default: true.
# :extension:: Set the post extension. Default: "html"
# :status:: Array of allowed post statuses. Only
# posts with matching status will be migrated.
# Known statuses are :publish, :draft, :private,
# and :revision. If this is nil or an empty
# array, all posts are migrated regardless of
# status. Default: [:publish].
#
def self.process(opts)
options = {
:user => opts.fetch("user", ""),
:pass => opts.fetch("password", ""),
:host => opts.fetch("host", "localhost"),
:port => opts.fetch("port", "3306"),
:socket => opts.fetch("socket", nil),
:dbname => opts.fetch("dbname", ""),
:table_prefix => opts.fetch("table_prefix", "wp_"),
:site_prefix => opts.fetch("site_prefix", nil),
:clean_entities => opts.fetch("clean_entities", true),
:comments => opts.fetch("comments", true),
:categories => opts.fetch("categories", true),
:tags => opts.fetch("tags", true),
:more_excerpt => opts.fetch("more_excerpt", true),
:more_anchor => opts.fetch("more_anchor", true),
:extension => opts.fetch("extension", "html"),
:status => opts.fetch("status", ["publish"]).map(&:to_sym), # :draft, :private, :revision
}
if options[:clean_entities]
begin
require "htmlentities"
rescue LoadError
warn "Could not require 'htmlentities', so the " \
":clean_entities option is now disabled."
options[:clean_entities] = false
end
end
FileUtils.mkdir_p("_posts")
FileUtils.mkdir_p("_drafts") if options[:status].include? :draft
db = Sequel.mysql2(options[:dbname],
:user => options[:user],
:password => options[:pass],
:socket => options[:socket],
:host => options[:host],
:port => options[:port],
:encoding => "utf8")
px = options[:table_prefix]
sx = options[:site_prefix]
page_name_list = {}
page_name_query = "
SELECT
posts.ID AS `id`,
posts.post_title AS `title`,
posts.post_name AS `slug`,
posts.post_parent AS `parent`
FROM #{px}#{sx}posts AS `posts`
WHERE posts.post_type = 'page'"
db[page_name_query].each do |page|
page[:slug] = sluggify(page[:title]) if page.fetch(:slug, "").empty?
page_name_list[ page[:id] ] = {
:slug => page[:slug],
:parent => page[:parent],
}
end
posts_query = "
SELECT
posts.ID AS `id`,
posts.guid AS `guid`,
posts.post_type AS `type`,
posts.post_status AS `status`,
posts.post_title AS `title`,
posts.post_name AS `slug`,
posts.post_date AS `date`,
posts.post_date_gmt AS `date_gmt`,
posts.post_content AS `content`,
posts.post_excerpt AS `excerpt`,
posts.comment_count AS `comment_count`,
users.display_name AS `author`,
users.user_login AS `author_login`,
users.user_email AS `author_email`,
users.user_url AS `author_url`
FROM #{px}#{sx}posts AS `posts`
LEFT JOIN #{px}#{sx}users AS `users`
ON posts.post_author = users.ID"
if options[:status] && !options[:status].empty?
status = options[:status][0]
posts_query << "
WHERE posts.post_status = '#{status}'"
options[:status][1..-1].each do |post_status|
posts_query << " OR
posts.post_status = '#{post_status}'"
end
end
db[posts_query].each do |post|
process_post(post, db, options, page_name_list)
end
end
def self.process_post(post, db, options, page_name_list)
px = options[:table_prefix]
sx = options[:site_prefix]
extension = options[:extension]
title = post[:title]
title = clean_entities(title) if options[:clean_entities]
slug = post[:slug]
slug = sluggify(title) if !slug || slug.empty?
date = post[:date] || Time.now
name = format("%02d-%02d-%02d-%s.%s", date.year, date.month, date.day, slug, extension)
content = post[:content].to_s
content = clean_entities(content) if options[:clean_entities]
excerpt = post[:excerpt].to_s
more_index = content.index(%r//)
more_anchor = nil
if more_index
if options[:more_excerpt] &&
(post[:excerpt].nil? || post[:excerpt].empty?)
excerpt = content[0...more_index]
end
if options[:more_anchor]
more_anchor = "more"
content.sub!(%r//,
"" \
"")
end
end
categories = []
tags = []
if options[:categories] || options[:tags]
cquery =
"SELECT
terms.name AS `name`,
ttax.taxonomy AS `type`
FROM
#{px}#{sx}terms AS `terms`,
#{px}#{sx}term_relationships AS `trels`,
#{px}#{sx}term_taxonomy AS `ttax`
WHERE
trels.object_id = '#{post[:id]}' AND
trels.term_taxonomy_id = ttax.term_taxonomy_id AND
terms.term_id = ttax.term_id"
db[cquery].each do |term|
if options[:categories] && term[:type] == "category"
categories << if options[:clean_entities]
clean_entities(term[:name])
else
term[:name]
end
elsif options[:tags] && term[:type] == "post_tag"
tags << if options[:clean_entities]
clean_entities(term[:name])
else
term[:name]
end
end
end
end
comments = []
if options[:comments] && post[:comment_count].to_i.positive?
cquery =
"SELECT
comment_ID AS `id`,
comment_author AS `author`,
comment_author_email AS `author_email`,
comment_author_url AS `author_url`,
comment_date AS `date`,
comment_date_gmt AS `date_gmt`,
comment_content AS `content`
FROM #{px}#{sx}comments
WHERE
comment_post_ID = '#{post[:id]}' AND
comment_approved != 'spam'"
db[cquery].each do |comment|
comcontent = comment[:content].to_s
comcontent.force_encoding("UTF-8") if comcontent.respond_to?(:force_encoding)
comcontent = clean_entities(comcontent) if options[:clean_entities]
comauthor = comment[:author].to_s
comauthor = clean_entities(comauthor) if options[:clean_entities]
comments << {
"id" => comment[:id].to_i,
"author" => comauthor,
"author_email" => comment[:author_email].to_s,
"author_url" => comment[:author_url].to_s,
"date" => comment[:date].to_s,
"date_gmt" => comment[:date_gmt].to_s,
"content" => comcontent,
}
end
comments.sort! { |a, b| a["id"] <=> b["id"] }
end
# Get the relevant fields as a hash, delete empty fields and
# convert to YAML for the header.
data = {
"layout" => post[:type].to_s,
"status" => post[:status].to_s,
"published" => post[:status].to_s == "draft" ? nil : (post[:status].to_s == "publish"),
"title" => title.to_s,
"author" => {
"display_name" => post[:author].to_s,
"login" => post[:author_login].to_s,
"email" => post[:author_email].to_s,
"url" => post[:author_url].to_s,
},
"author_login" => post[:author_login].to_s,
"author_email" => post[:author_email].to_s,
"author_url" => post[:author_url].to_s,
"excerpt" => excerpt,
"more_anchor" => more_anchor,
"wordpress_id" => post[:id],
"wordpress_url" => post[:guid].to_s,
"date" => date.to_s,
"date_gmt" => post[:date_gmt].to_s,
"categories" => options[:categories] ? categories : nil,
"tags" => options[:tags] ? tags : nil,
"comments" => options[:comments] ? comments : nil,
}.delete_if { |_k, v| v.nil? || v == "" }.to_yaml
if post[:type] == "page"
filename = page_path(post[:id], page_name_list) + "index.#{extension}"
FileUtils.mkdir_p(File.dirname(filename))
elsif post[:status] == "draft"
filename = "_drafts/#{slug}.md"
else
filename = "_posts/#{name}"
end
# Write out the data and content to file
File.open(filename, "w") do |f|
f.puts data
f.puts "---"
f.puts Util.wpautop(content)
end
end
def self.clean_entities(text)
text.force_encoding("UTF-8") if text.respond_to?(:force_encoding)
text = HTMLEntities.new.encode(text, :named)
# We don't want to convert these, it would break all
# HTML tags in the post and comments.
text.gsub!("&", "&")
text.gsub!("<", "<")
text.gsub!(">", ">")
text.gsub!(""", '"')
text.gsub!("'", "'")
text.gsub!("/", "/")
text
end
def self.sluggify(title)
title.to_ascii.downcase.gsub(%r![^0-9A-Za-z]+!, " ").strip.tr(" ", "-")
end
def self.page_path(page_id, page_name_list)
if page_name_list.key?(page_id)
[
page_path(page_name_list[page_id][:parent], page_name_list),
page_name_list[page_id][:slug],
"/",
].join("")
else
""
end
end
end
end
end