lib/jekyll-import/importers/tumblr.rb in jekyll-import-0.14.0 vs lib/jekyll-import/importers/tumblr.rb in jekyll-import-0.15.0

- old
+ new

@@ -1,5 +1,7 @@ +# frozen_string_literal: true + module JekyllImport module Importers class Tumblr < Importer def self.require_deps JekyllImport.require_with_fallback(%w( @@ -13,15 +15,15 @@ jekyll )) end def self.specify_options(c) - c.option "url", "--url URL", "Tumblr URL" - c.option "format", "--format FORMAT", 'Output format (default: "html")' - c.option "grab_images", "--grab_images", "Whether to grab images (default: false)" + c.option "url", "--url URL", "Tumblr URL" + c.option "format", "--format FORMAT", 'Output format (default: "html")' + c.option "grab_images", "--grab_images", "Whether to grab images (default: false)" c.option "add_highlights", "--add_highlights", "Whether to add highlights (default: false)" - c.option "rewrite_urls", "--rewrite_urls", "Whether to rewrite URLs (default: false)" + c.option "rewrite_urls", "--rewrite_urls", "Whether to rewrite URLs (default: false)" end def self.process(options) url = options.fetch("url") format = options.fetch("format", "html") @@ -32,24 +34,27 @@ @grab_images = grab_images FileUtils.mkdir_p "_posts/tumblr" url += "/api/read/json/" per_page = 50 posts = [] + # Two passes are required so that we can rewrite URLs. # First pass builds up an array of each post as a hash. begin current_page = (current_page || -1) + 1 - feed_url = url + "?num=#{per_page}&start=#{current_page * per_page}" - puts "Fetching #{feed_url}" - feed = open(feed_url) + feed_url = "#{url}?num=#{per_page}&start=#{current_page * per_page}" + Jekyll.logger.info "Fetching #{feed_url}" + + feed = URI.parse(feed_url).open contents = feed.readlines.join("\n") - blog = extract_json(contents) - puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" + blog = extract_json(contents) + Jekyll.logger.info "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" + batch = blog["posts"].map { |post| post_to_hash(post, format) } - # If we're rewriting, save the posts for later. Otherwise, go ahead and - # dump these to disk now + # If we're rewriting, save the posts for later. Otherwise, go ahead and dump these to + # disk now if rewrite_urls posts += batch else batch.each { |post| write_post(post, format == "md", add_highlights) } end @@ -60,112 +65,103 @@ posts = rewrite_urls_and_redirects posts posts.each { |post| write_post(post, format == "md", add_highlights) } end end - private class << self def extract_json(contents) beginning = contents.index("{") - ending = contents.rindex("}") + 1 - json = contents[beginning...ending] # Strip Tumblr's JSONP chars. - JSON.parse(json) + ending = contents.rindex("}") + 1 + json_data = contents[beginning...ending] # Strip Tumblr's JSONP chars. + JSON.parse(json_data) end # Writes a post out to disk def write_post(post, use_markdown, add_highlights) content = post[:content] + return unless content - if content - if use_markdown - content = html_to_markdown content - if add_highlights - tumblr_url = URI.parse(post[:slug]).path - redirect_dir = tumblr_url.sub(%r!\/!, "") + "/" - FileUtils.mkdir_p redirect_dir - content = add_syntax_highlights(content, redirect_dir) - end + if use_markdown + content = html_to_markdown content + if add_highlights + tumblr_url = URI.parse(post[:slug]).path + redirect_dir = tumblr_url.sub(%r!\/!, "") + "/" + FileUtils.mkdir_p redirect_dir + content = add_syntax_highlights(content, redirect_dir) end + end - File.open("_posts/tumblr/#{post[:name]}", "w") do |f| - f.puts post[:header].to_yaml + "---\n" + content - end + File.open("_posts/tumblr/#{post[:name]}", "w") do |f| + f.puts post[:header].to_yaml + "---\n" + content end end # Converts each type of Tumblr post to a hash with all required # data for Jekyll. def post_to_hash(post, format) case post["type"] when "regular" - title = post["regular-title"] - content = post["regular-body"] + title, content = post.values_at("regular-title", "regular-body") when "link" - title = post["link-text"] || post["link-url"] + title = post["link-text"] || post["link-url"] content = "<a href=\"#{post["link-url"]}\">#{title}</a>" - unless post["link-description"].nil? - content << "<br/>" + post["link-description"] - end + content << "<br/>#{post["link-description"]}" unless post["link-description"].nil? when "photo" title = post["slug"].tr("-", " ") if post["photos"].size > 1 - content = "" + content = +"" post["photos"].each do |post_photo| photo = fetch_photo post_photo - content << photo + "<br/>" + content << "#{photo}<br/>" content << post_photo["caption"] end else content = fetch_photo post end - content << "<br/>" + post["photo-caption"] + content << "<br/>#{post["photo-caption"]}" when "audio" if !post["id3-title"].nil? - title = post["id3-title"] - content = post["audio-player"] + "<br/>" + post["audio-caption"] + title, content = post.values_at("id3-title", "audio-player") + content << "<br/>#{post["audio-caption"]}" else - title = post["audio-caption"] - content = post["audio-player"] + title, content = post.values_at("audio-caption", "audio-player") end when "quote" - title = post["quote-text"] + title = post["quote-text"] content = "<blockquote>#{post["quote-text"]}</blockquote>" - unless post["quote-source"].nil? - content << "&#8212;" + post["quote-source"] - end + content << "&#8212;#{post["quote-source"]}" unless post["quote-source"].nil? when "conversation" - title = post["conversation-title"] + title = post["conversation-title"] content = "<section><dialog>" post["conversation"].each do |line| content << "<dt>#{line["label"]}</dt><dd>#{line["phrase"]}</dd>" end content << "</dialog></section>" when "video" - title = post["video-title"] - content = post["video-player"] + title, content = post.values_at("video-title", "video-player") unless post["video-caption"].nil? if content - content << "<br/>" + post["video-caption"] + content << "<br/>#{post["video-caption"]}" else content = post["video-caption"] end end when "answer" - title = post["question"] - content = post["answer"] + title, content = post.values_at("question", "answer") end - date = Date.parse(post["date"]).to_s + + date = Date.parse(post["date"]).to_s title = Nokogiri::HTML(title).text title = "no title" if title.empty? - slug = if post["slug"] && post["slug"].strip != "" - post["slug"] - elsif title && title.downcase.gsub(%r![^a-z0-9\-]!, "") != "" && title != "no title" - slug = title.downcase.strip.tr(" ", "-").gsub(%r![^a-z0-9\-]!, "") - slug.length > 200 ? slug.slice(0..200) : slug - else - post["id"] - end + slug = if post["slug"] && post["slug"].strip != "" + post["slug"] + elsif title && title.downcase.gsub(%r![^a-z0-9\-]!, "") != "" && title != "no title" + slug = title.downcase.strip.tr(" ", "-").gsub(%r![^a-z0-9\-]!, "") + slug.length > 200 ? slug.slice(0..200) : slug + else + post["id"] + end { :name => "#{date}-#{slug}.#{format}", :header => { "layout" => "post", "title" => title, @@ -178,12 +174,12 @@ :slug => post["url-with-slug"], } end # Attempts to fetch the largest version of a photo available for a post. - # If that file fails, it tries the next smaller size until all available - # photo URLs are exhausted. If they all fail, the import is aborted. + # If that file fails, it tries the next smaller size until all available photo URLs are + # exhausted. If they all fail, the import is aborted. def fetch_photo(post) sizes = post.keys.map { |k| k.gsub("photo-url-", "").to_i } sizes.sort! { |a, b| b <=> a } _ext_key, ext_val = post.find do |k, v| @@ -192,34 +188,34 @@ ext = "." + ext_val.split(".").last sizes.each do |size| url = post["photo-url"] || post["photo-url-#{size}"] next if url.nil? + begin - return "<img src=\"#{save_photo(url, ext)}\"/>" + return +"<img src=\"#{save_photo(url, ext)}\"/>" rescue OpenURI::HTTPError - puts "Failed to grab photo" + Jekyll.logger.warn "Failed to grab photo" end end abort "Failed to fetch photo for post #{post["url"]}" end - # Create a Hash of old urls => new urls, for rewriting and - # redirects, and replace urls in each post. Instantiate Jekyll - # site/posts to get the correct permalink format. + # Create a Hash of old urls => new urls, for rewriting and redirects, and replace urls in + # each post. Instantiate Jekyll site/posts to get the correct permalink format. def rewrite_urls_and_redirects(posts) site = Jekyll::Site.new(Jekyll.configuration({})) urls = Hash[posts.map do |post| - # Create an initial empty file for the post so that - # we can instantiate a post object. - File.write("_posts/tumblr/#{post[:name]}", "") + # Create an initial empty file for the post so that we can instantiate a post object. + relative_path = "_posts/tumblr/#{post[:name]}" + File.write(relative_path, "") tumblr_url = URI.parse(URI.encode(post[:slug])).path jekyll_url = if Jekyll.const_defined? :Post - Jekyll::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url + Jekyll::Post.new(site, site.source, "", "tumblr/#{post[:name]}").url else - Jekyll::Document.new(File.expand_path("_posts/tumblr/#{post[:name]}"), :site => site, :collection => site.posts).url + Jekyll::Document.new(site.in_source_dir(relative_path), :site => site, :collection => site.posts).url end redirect_dir = tumblr_url.sub(%r!\/!, "") + "/" FileUtils.mkdir_p redirect_dir File.open(redirect_dir + "index.html", "w") do |f| f.puts "<html><head><link rel=\"canonical\" href=\"" \ @@ -238,66 +234,64 @@ # Convert preserving HTML tables as per the markdown docs. def html_to_markdown(content) preserve = %w(table tr th td) preserve.each do |tag| - content.gsub!(%r!<#{tag}!i, "$$" + tag) - content.gsub!(%r!<\/#{tag}!i, "||" + tag) + content.gsub!(%r!<#{tag}!i, "$$#{tag}") + content.gsub!(%r!<\/#{tag}!i, "||#{tag}") end content = Nokogiri::HTML(content.gsub("'", "''")).text preserve.each do |tag| - content.gsub!("$$" + tag, "<" + tag) - content.gsub!("||" + tag, "</" + tag) + content.gsub!("$$#{tag}", "<#{tag}") + content.gsub!("||#{tag}", "</#{tag}") end content end - # Adds pygments highlight tags to code blocks in posts that use - # markdown format. This doesn't guess the language of the code - # block, so you should modify this to suit your own content. - # For example, my code block only contain Python and JavaScript, - # so I can assume the block is JavaScript if it contains a - # semi-colon. + # Adds pygments highlight tags to code blocks in posts that use markdown format. + # This doesn't guess the language of the code block, so you should modify this to suit your + # own content. + # For example, my code block only contain Python and JavaScript, so I can assume the block + # is JavaScript if it contains a semi-colon. def add_syntax_highlights(content, redirect_dir) - lines = content.split("\n") - block = false - indent = %r!^ ! - lang = nil - start = nil - lines.each_with_index do |line, i| - if !block && line =~ indent - block = true - lang = "python" - start = i - elsif block - lang = "javascript" if line =~ %r!;$! - block = line =~ indent && i < lines.size - 1 # Also handle EOF - unless block - lines[start] = "{% highlight #{lang} %}" - lines[i - 1] = "{% endhighlight %}" + lines = content.split("\n") + block = false + indent = %r!^ ! + lang = nil + start = nil + lines.each_with_index do |line, i| + if !block && line =~ indent + block = true + lang = "python" + start = i + elsif block + lang = "javascript" if line =~ %r!;$! + block = line =~ indent && i < lines.size - 1 # Also handle EOF + unless block + lines[start] = "{% highlight #{lang} %}" + lines[i - 1] = "{% endhighlight %}" + end + FileUtils.cp(redirect_dir + "index.html", redirect_dir + "../" + "index.html") + lines[i] = lines[i].sub(indent, "") end - FileUtils.cp(redirect_dir + "index.html", redirect_dir + "../" + "index.html") - lines[i] = lines[i].sub(indent, "") end + lines.join("\n") end - lines.join("\n") - end def save_photo(url, ext) - if @grab_images + return url unless @grab_images + path = "tumblr_files/#{url.split("/").last}" path += ext unless path =~ %r!#{ext}$! FileUtils.mkdir_p "tumblr_files" # Don't fetch if we've already cached this file unless File.size? path - puts "Fetching photo #{url}" - File.open(path, "wb") { |f| f.write(open(url).read) } + Jekyll.logger.info "Fetching photo #{url}" + File.open(path, "wb") { |f| f.write(URI.parse(url).read) } end - url = "/" + path + "/#{path}" end - url - end end end end end