module JekyllImport module Importers class Tumblr < Importer def self.require_deps JekyllImport.require_with_fallback(%w[ rubygems fileutils open-uri nokogiri json uri time jekyll ]) end def self.specify_options(c) c.option 'url', '--url URL', 'Tumblr URL' c.option 'format', '--format FORMAT', 'Output format (default: "html")' c.option 'grab_images', '--grab_images', 'Whether to grab images (default: false)' c.option 'add_highlights', '--add_highlights', 'Whether to add highlights (default: false)' c.option 'rewrite_urls', '--rewrite_urls', 'Whether to rewrite URLs (default: false)' end def self.process(options) url = options.fetch('url') format = options.fetch('format', "html") grab_images = options.fetch('grab_images', false) add_highlights = options.fetch('add_highlights', false) rewrite_urls = options.fetch('rewrite_urls', false) @grab_images = grab_images FileUtils.mkdir_p "_posts/tumblr" url += "/api/read/json/" per_page = 50 posts = [] # Two passes are required so that we can rewrite URLs. # First pass builds up an array of each post as a hash. begin current_page = (current_page || -1) + 1 feed_url = url + "?num=#{per_page}&start=#{current_page * per_page}" puts "Fetching #{feed_url}" feed = open(feed_url) json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars. blog = JSON.parse(json) puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" batch = blog["posts"].map { |post| post_to_hash(post, format) } # If we're rewriting, save the posts for later. Otherwise, go ahead and # dump these to disk now if rewrite_urls posts += batch else batch.each {|post| write_post(post, format == "md", add_highlights)} end end until blog["posts"].size < per_page # Rewrite URLs, create redirects and write out out posts if necessary if rewrite_urls posts = rewrite_urls_and_redirects posts posts.each {|post| write_post(post, format == "md", add_highlights)} end end private # Writes a post out to disk def self.write_post(post, use_markdown, add_highlights) content = post[:content] if content if use_markdown content = html_to_markdown content content = add_syntax_highlights content if add_highlights end File.open("_posts/tumblr/#{post[:name]}", "w") do |f| f.puts post[:header].to_yaml + "---\n" + content end end end # Converts each type of Tumblr post to a hash with all required # data for Jekyll. def self.post_to_hash(post, format) case post['type'] when "regular" title = post["regular-title"] content = post["regular-body"] when "link" title = post["link-text"] || post["link-url"] content = "#{title}" unless post["link-description"].nil? content << "
" + post["link-description"] end when "photo" title = post["photo-caption"] content = if post["photo-link-url"].nil? "#{content}" else fetch_photo post end when "audio" if !post["id3-title"].nil? title = post["id3-title"] content = post["audio-player"] + "
" + post["audio-caption"] else title = post["audio-caption"] content = post["audio-player"] end when "quote" title = post["quote-text"] content = "

#{post["quote-text"]}

" unless post["quote-source"].nil? content << "—" + post["quote-source"] end when "conversation" title = post["conversation-title"] content = "

" when "video" title = post["video-title"] content = post["video-player"] unless post["video-caption"].nil? content << "
" + post["video-caption"] end when "answer" title = post["question"] content = post["answer"] end date = Date.parse(post['date']).to_s title = Nokogiri::HTML(title).text slug = if post["slug"] && post["slug"].strip != "" post["slug"] else slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') slug.length > 200 ? slug.slice(0..200) : slug end { :name => "#{date}-#{slug}.#{format}", :header => { "layout" => "post", "title" => title, "date" => Time.parse(post['date']).xmlschema, "tags" => (post["tags"] or []), "tumblr_url" => post["url-with-slug"] }, :content => content, :url => post["url"], :slug => post["url-with-slug"], } end # Attempts to fetch the largest version of a photo available for a post. # If that file fails, it tries the next smaller size until all available # photo URLs are exhausted. If they all fail, the import is aborted. def self.fetch_photo(post) sizes = post.keys.map {|k| k.gsub("photo-url-", "").to_i} sizes.sort! {|a,b| b <=> a} ext_key, ext_val = post.find do |k,v| k =~ /^photo-url-/ && v.split("/").last =~ /\./ end ext = "." + ext_val.split(".").last sizes.each do |size| url = post["photo-url"] || post["photo-url-#{size}"] next if url.nil? begin return "

" rescue OpenURI::HTTPError => err puts "Failed to grab photo" end end abort "Failed to fetch photo for post #{post['url']}" end # Create a Hash of old urls => new urls, for rewriting and # redirects, and replace urls in each post. Instantiate Jekyll # site/posts to get the correct permalink format. def self.rewrite_urls_and_redirects(posts) site = Jekyll::Site.new(Jekyll.configuration({})) urls = Hash[posts.map { |post| # Create an initial empty file for the post so that # we can instantiate a post object. File.open("_posts/tumblr/#{post[:name]}", "w") tumblr_url = URI.parse(post[:slug]).path jekyll_url = Jekyll::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url redirect_dir = tumblr_url.sub(/\//, "") + "/" FileUtils.mkdir_p redirect_dir File.open(redirect_dir + "index.html", "w") do |f| f.puts "" end [tumblr_url, jekyll_url] }] posts.map { |post| urls.each do |tumblr_url, jekyll_url| post[:content].gsub!(/#{tumblr_url}/i, jekyll_url) end post } end # Convert preserving HTML tables as per the markdown docs. def self.html_to_markdown(content) preserve = ["table", "tr", "th", "td"] preserve.each do |tag| content.gsub!(/<#{tag}/i, "$$" + tag) content.gsub!(/<\/#{tag}/i, "||" + tag) end content = Nokogiri::HTML(content.gsub("'", "''")).text preserve.each do |tag| content.gsub!("$$" + tag, "<" + tag) content.gsub!("||" + tag, "