require 'net/http'
require 'uri'

puts "using http-ncbi-dbs-dgs.rake"

# Downloads the tarball at the given URL if a local copy does not exist, if
# the local copy is older than the copy at the given URL, or if the local
# copy is corrupt. last_to_do is the URL of the database's last volume; only
# that volume extracts the files shared between volumes.
def download(url, last_to_do)
  file = File.basename(url)
  # Resume an interrupted download or fetch the file for the first time. If
  # the file on the server is newer, it is downloaded from the start.
  sh "wget -Nc --no-verbose #{url}"
  # If the local copy is already fully retrieved, the previous command
  # ignores the timestamp. So we check with the server again and download a
  # new copy if the file on the server is newer.
  sh "wget -N --no-verbose #{url}"
  sh "wget -Nc --no-verbose #{url}.md5"
  sh "wget -N --no-verbose #{url}.md5"
  # Verify the tarball against its md5. Re-download the tarball if corrupt;
  # extract otherwise.
  sh "md5sum -c #{file}.md5" do |matched, _|
    if !matched
      sh "rm #{file} #{file}.md5"
      download(url, last_to_do)
    elsif url == last_to_do
      # Too many tar instances unzipping the same file clutter the system,
      # so only the last volume extracts everything.
      sh "tar xfov #{file}"
    else
      # Volumes of at least nr and nt contain identical .?al files; unsure
      # of others.
      sh "tar xfov #{file} --exclude='*.?al' --exclude='taxdb*'"
    end
  end
end

def databases
  host, dir = 'ftp.ncbi.nlm.nih.gov', 'blast/db'
  uri = URI.parse("https://#{host}/#{dir}/")
  response = Net::HTTP.get_response(uri)
  tokens = response.body.split
  filenames = []
  tokens.each do |token|
    # The regex scans tokens of the raw HTTP response, matching entries
    # such as:
    #   href="tsa_nt.06.tar.gz">tsa_nt.06.tar.gz</a>
    # and capturing the filename:
    #   tsa_nt.06.tar.gz
    filename = token[/(^href=".*">)(.*tar.gz|.*md5)(<\/a>)$/, 2]
    filenames << filename unless filename.nil?
  end
  # Prepend the host and directory so downstream wget gets a full URL, keep
  # only the tarballs, and group the volumes by database name.
  filenames.
    map { |file| "https://#{host}/#{dir}/#{file}" }.
    select { |file| file.match(/\.tar\.gz$/) }.
    group_by { |file| File.basename(file).split('.')[0] }
end

# Create a user-facing task for each database to drive the download of its
# volumes in parallel.
databases.each do |name, files|
  multitask(name => files.map { |file| task(file) { download(file, files.last) } })
end

# List the names of all databases that can be downloaded if executed without
# any arguments.
task :default do
  puts databases.keys.push('taxdump').join(', ')
end

task :taxdump do
  url = 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
  download(url, url)
end

# Ruby being over my head, this no-op task is my quick-and-dirty way of
# getting rake to accept "http" on the command line as a task rather than
# failing on an unknown target. Happy for an expert to fix it up!
task :http do
  puts "using http method"
end
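
# Example invocation (a sketch; assumes this file is saved as
# http-ncbi-dbs-dgs.rake in the working directory):
#
#   rake -f http-ncbi-dbs-dgs.rake            # list downloadable databases
#   rake -f http-ncbi-dbs-dgs.rake taxdump    # fetch the taxonomy dump
#   rake -f http-ncbi-dbs-dgs.rake nt -j 4    # download nt, 4 volumes at a time
#
# Because each database is defined as a multitask, rake's -j flag caps how
# many volumes are fetched concurrently.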