require 'rbbt/util/open'
require 'rbbt/sources/organism'
require 'rbbt/tsv'
require 'rbbt/sources/ensembl'
require 'net/ftp'

module Ensembl

  # Helpers to locate database dump directories on the Ensembl FTP servers
  # and to download and parse the table dumps found there.
  module FTP

    SERVER = "ftp.ensembl.org"
    DOMAIN_SERVER = "ftp.ensemblgenomes.org"

    # Locates the dump directory of +organism+ on DOMAIN_SERVER.
    #
    # domain   - sub-directory of 'pub' to look into.
    # organism - organism code, optionally followed by "/<build>".
    # subdir   - kind of dump to look for (default 'mysql').
    #
    # Returns a [release, path] pair where path is "<server>/<dir>/<entry>"
    # without the "ftp://" scheme. Raises if the listing has no usable entry.
    def self.ftp_name_for_domain(domain, organism, subdir='mysql')
      _code, build = organism.split "/"
      build ||= "current"

      release = build == "current" ? 'current' : Ensembl.releases[build]

      # NOTE(review): the directory is always the 'current' one, whatever the
      # build is; only the returned release label depends on it. Confirm this
      # is the intended layout for this server.
      dir = File.join('pub', domain, 'current', subdir)
      file = _latest_core_entry(DOMAIN_SERVER, dir, organism, true)

      [release, File.join(DOMAIN_SERVER, dir, file)]
    end

    # Locates the dump directory of +organism+. When the thread-local
    # "ensembl_domain" is set the lookup is delegated to ftp_name_for_domain;
    # otherwise SERVER is queried.
    #
    # organism - organism code, optionally followed by "/<build>".
    # subdir   - kind of dump to look for (default 'mysql').
    #
    # Returns a [release, path] pair where path is "<server>/<dir>/<entry>"
    # without the "ftp://" scheme. Raises ArgumentError when the build is not
    # listed in Ensembl.releases, and raises if the listing has no usable entry.
    def self.ftp_name_for(organism, subdir='mysql')
      domain = Thread.current["ensembl_domain"]
      return ftp_name_for_domain(domain, organism, subdir) if domain

      _code, build = organism.split "/"
      build ||= "current"

      if build.to_s == "current"
        release = 'current'
        dir = File.join('pub', "current_#{subdir}")
      else
        release = Ensembl.releases[build]
        # Fail before opening a connection rather than inside File.join.
        raise ArgumentError, "Unknown Ensembl build '#{build}' for organism '#{organism}'" if release.nil?
        dir = File.join('pub', release, subdir)
      end

      file = _latest_core_entry(SERVER, dir, organism)

      [release, File.join(SERVER, dir, file)]
    end

    # Path (server, directory and entry; no scheme) of the dump directory of
    # +organism+, as resolved by ftp_name_for.
    def self.ftp_url_for(organism)
      ftp_name_for(organism).last
    end

    # Full "ftp://" URL of the dump directory of +organism+.
    def self.base_url(organism)
      File.join("ftp://", ftp_url_for(organism) )
    end

    # URL of the gzipped dump file "<table>.<extension>.gz" of +organism+.
    def self.url_for(organism, table, extension)
      File.join(base_url(organism), table) + ".#{extension}.gz"
    end

    # Downloads +url+ through wget and returns its gunzipped content. If that
    # command fails, "<url>.bz2" is tried instead, decompressed with bunzip2
    # and then gunzip.
    def self._get_gz(url)
      begin
        CMD.cmd("wget '#{url}' -O  - | gunzip").read
      rescue
        CMD.cmd("wget '#{url}.bz2' -O  - | bunzip2 | gunzip").read
      end
    end

    # Downloads and decompresses the "<table>.<extension>.gz" dump file of
    # +organism+.
    def self._get_file(organism, table, extension)
      url = url_for(organism, table, extension)
      self._get_gz(url)
    end

    # Whether the schema file of +organism+ has a CREATE TABLE statement for
    # +table+.
    def self.has_table?(organism, table)
      ! _table_definition(_schema_sql(organism), table).nil?
    end

    # Column names of +table+, in the order they appear in the CREATE TABLE
    # statement of the schema file of +organism+.
    #
    # Raises ArgumentError when the schema has no such table.
    def self.fields_for(organism, table)
      chunk = _table_definition(_schema_sql(organism), table)
      raise ArgumentError, "Table '#{table}' not found in the schema of '#{organism}'" if chunk.nil?
      chunk.scan(/^\s+`(.*?)`/).flatten
    end

    # Downloads the "txt" dump of +table+ and opens it with TSV.open.
    #
    # organism  - organism code, optionally followed by "/<build>".
    # table     - name of the table to load.
    # key_field - column name to use as key (optional).
    # fields    - list of column names to load (optional).
    # options   - options handed to TSV.open; the hash is not modified.
    #
    # When both +key_field+ and +fields+ are given, they are translated into
    # column positions (taken from the schema) and passed to TSV.open as
    # :key_field and :fields. The names are then assigned to the resulting
    # object, which is returned.
    def self.ensembl_tsv(organism, table, key_field = nil, fields = nil, options = {})
      if key_field && fields
        all_fields = fields_for(organism, table)
        key_pos = all_fields.index key_field
        field_pos = fields.collect{|f| all_fields.index f}

        # Build a new hash so the caller's options are left untouched.
        options = options.merge(:key_field => key_pos, :fields => field_pos)
      end

      tsv = TSV.open(StringIO.new(_get_file(organism, table, "txt")), options)
      tsv.key_field = key_field
      tsv.fields = fields
      tsv
    end

    # Lists the entries of +dir+ on +server+ whose names start with the
    # scientific name of +organism+ (lowercased, spaces turned into '_') and
    # returns the name of the last one that passes the filters. Listing lines
    # that split into more than three '_'-separated pieces are kept only if
    # they contain "_core_"; with +skip_gz+, lines ending in ".gz" are
    # dropped as well. The connection is closed even if listing fails.
    def self._latest_core_entry(server, dir, organism, skip_gz = false)
      prefix = Organism.scientific_name(organism).downcase.gsub(" ",'_')

      ftp = Net::FTP.new(server)
      begin
        ftp.passive = true
        ftp.login
        ftp.chdir(dir)
        lines = ftp.list(prefix + "*")
      ensure
        ftp.close
      end

      lines = lines.reject{|line| line.split("_").length > 3 && ! line.include?("_core_") }
      lines = lines.reject{|line| line =~ /\.gz$/ } if skip_gz

      # The entry name is the last whitespace-separated token of a listing line.
      file = lines.collect{|line| line.split(" ").last }.last
      raise "No entry matching '#{prefix}*' found in ftp://#{server}/#{dir}" if file.nil?
      file
    end

    # Downloads the schema file of +organism+: the "sql" dump named after the
    # dump directory itself. The directory is resolved only once.
    def self._schema_sql(organism)
      base = base_url(organism)
      _get_gz(File.join(base, File.basename(base)) + ".sql.gz")
    end

    # Body of the CREATE TABLE statement of +table+ in +sql+, or nil when
    # there is none. The table name is matched literally. Only the 'm' flag
    # is used: in Ruby an 's' flag selects the Windows-31J encoding and makes
    # the match fail on UTF-8 text with non-ASCII characters.
    def self._table_definition(sql, table)
      match = sql.match(/^CREATE TABLE .#{Regexp.escape(table.to_s)}. \((.*?)^\)/m)
      match && match[1]
    end

    private_class_method :_latest_core_entry, :_schema_sql, :_table_definition
  end
end