require 'rubygems' require 'open-uri' require 'fileutils' require 'nokogiri' require 'date' require 'json' require 'uri' require 'jekyll' module Jekyll module Tumblr def self.process(url, format = "html", grab_images = false, add_highlights = false, rewrite_urls = true) @grab_images = grab_images FileUtils.mkdir_p "_posts/tumblr" url += "/api/read/json/" per_page = 50 posts = [] # Two passes are required so that we can rewrite URLs. # First pass builds up an array of each post as a hash. begin current_page = (current_page || -1) + 1 feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}") json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars. blog = JSON.parse(json) puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" posts += blog["posts"].map { |post| post_to_hash(post, format) } end until blog["posts"].size < per_page # Rewrite URLs and create redirects. posts = rewrite_urls_and_redirects posts if rewrite_urls # Second pass for writing post files. posts.each do |post| if format == "md" post[:content] = html_to_markdown post[:content] post[:content] = add_syntax_highlights post[:content] if add_highlights end File.open("_posts/tumblr/#{post[:name]}", "w") do |f| f.puts post[:header].to_yaml + "---\n" + post[:content] end end end private # Converts each type of Tumblr post to a hash with all required # data for Jekyll. def self.post_to_hash(post, format) case post['type'] when "regular" title = post["regular-title"] content = post["regular-body"] when "link" title = post["link-text"] || post["link-url"] content = "#{title}" unless post["link-description"].nil? content << "
" + post["link-description"] end when "photo" title = post["photo-caption"] max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max url = post["photo-url"] || post["photo-url-#{max_size}"] ext = "." + post[post.keys.select { |k| k =~ /^photo-url-/ && post[k].split("/").last =~ /\./ }.first].split(".").last content = "" unless post["photo-link-url"].nil? content = "#{content}" end when "audio" if !post["id3-title"].nil? title = post["id3-title"] content = post.at["audio-player"] + "
" + post["audio-caption"] else title = post["audio-caption"] content = post.at["audio-player"] end when "quote" title = post["quote-text"] content = "
#{post["quote-text"]}
" unless post["quote-source"].nil? content << "—" + post["quote-source"] end when "conversation" title = post["conversation-title"] content = "
" post["conversation"]["line"].each do |line| content << "
#{line['label']}
#{line}
" end content << "
" when "video" title = post["video-title"] content = post["video-player"] unless post["video-caption"].nil? content << "
" + post["video-caption"] end end date = Date.parse(post['date']).to_s title = Nokogiri::HTML(title).text slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') { :name => "#{date}-#{slug}.#{format}", :header => { "layout" => "post", "title" => title, "tags" => post["tags"], }, :content => content, :url => post["url"], :slug => post["url-with-slug"], } end # Create a Hash of old urls => new urls, for rewriting and # redirects, and replace urls in each post. Instantiate Jekyll # site/posts to get the correct permalink format. def self.rewrite_urls_and_redirects(posts) site = Jekyll::Site.new(Jekyll.configuration({})) dir = File.join(File.dirname(__FILE__), "..") urls = Hash[posts.map { |post| # Create an initial empty file for the post so that # we can instantiate a post object. File.open("_posts/tumblr/#{post[:name]}", "w") tumblr_url = URI.parse(post[:slug]).path jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url redirect_dir = tumblr_url.sub(/\//, "") + "/" FileUtils.mkdir_p redirect_dir File.open(redirect_dir + "index.html", "w") do |f| f.puts "" end [tumblr_url, jekyll_url] }] posts.map { |post| urls.each do |tumblr_url, jekyll_url| post[:content].gsub!(/#{tumblr_url}/i, jekyll_url) end post } end # Uses Python's html2text to convert a post's content to # markdown. Preserve HTML tables as per the markdown docs. def self.html_to_markdown(content) preserve = ["table", "tr", "th", "td"] preserve.each do |tag| content.gsub!(/<#{tag}/i, "$$" + tag) content.gsub!(/<\/#{tag}/i, "||" + tag) end content = %x[echo '#{content.gsub("'", "''")}' | html2text] preserve.each do |tag| content.gsub!("$$" + tag, "<" + tag) content.gsub!("||" + tag, "