lib/ensembl/variation/activerecord.rb in ensembl-0.0.6 vs lib/ensembl/variation/activerecord.rb in ensembl-0.0.7

- old
+ new

@@ -5,16 +5,11 @@ class Connection < ActiveRecord::Base self.extend TableNameOverrides self.abstract_class = true - self.establish_connection :adapter => "mysql2", - :host => Ensembl.host, - :username => Ensembl.username, - :password => Ensembl.password, - :database => Ensembl.species+'_variation_'+Ensembl.version+'_'+Ensembl.hg_version, - :reconnect => true + self.establish_connection :variation end class ModelBase < Connection self.extend PrimaryKeyOverrides @@ -40,24 +35,38 @@ belongs_to :associated_study, foreign_key: 'study2_id', class_name: 'Study' end class Attrib < ModelBase - self.extend SearchByAttribute + # self.extend AttributeLike belongs_to :attrib_type end class AttribSet < ModelBase belongs_to :attrib end class AttribType < ModelBase + has_many :attribs, class_name: 'Attrib' has_many :pheotype_feature_attrib has_many :phenotype_features, through: :phenotype_feature_attrib + scope :common_values, -> { where(attrib_type_id: self.mapping_hash.keys)} + + def self.mapping_hash + @mapping_hash||={14=>:risk_allele,15=>:p_value,23=>:odds_ratio,24=>:beta} + end + + def self.key(value) + mapping_hash.key(value) + end + + def self.symbol(key) + mapping_hash[key] + end end class CompressedGenotypeRegion < Connection belongs_to :individual belongs_to :seq_region, class_name: 'Ensembl::Core::SeqRegion' @@ -72,15 +81,13 @@ # To decrease number of DB queries needed # FIXME: Should be in GenotypeCodes class or should use caching allele_codes=GenotypeCode.eager_load(:allele_code).where(:genotype_code_id=>genotype_code_ids.uniq).inject({}){|hsh,gc|hsh[gc.genotype_code_id]=gc.allele_code.allele;hsh} - #genotype_code_ids.uniq.inject({}) { |hsh, gc_id | hsh[gc_id]=GenotypeCode.find(gc_id).allele_code.allele;hsh } - @igs||=unpacked_genotypes.map{|s| IndividualGenotype.new({ individual_id: s[0], - genotype_code_id:s[1], + genotype_code_id: s[1], allele: allele_codes[s[1]] })} end def unpacked_genotypes unpack_genotypes.each_slice(2).map{|sl| sl } @@ -125,13 +132,23 @@ has_one :failed_description end class GenotypeCode < ModelBase - belongs_to :allele_code + def self.genotype_for(genotype_code_id) + joins(:allele_code).where(genotype_code_id: genotype_code_id).order(:haplotype_id).pluck('allele_code.allele').join('|') + end + + def self.genotypes_for(genotype_code_ids) + includes(:allele_code).where(genotype_code_id: genotype_code_ids).pluck('genotype_code.genotype_code_id','genotype_code.haplotype_id','allele_code.allele').group_by{|r| r[0]}.map{|k,v| [k,v.sort_by{|f,s| f[1]<=>s[1]}.map{|v| v[2]}.join('|')]} + end + + def self.genotypes_hash_for(genotype_code_ids) + genotypes_for(genotype_code_ids).to_h + end end class Individual < ModelBase belongs_to :individual_type belongs_to :father, foreign_key: 'father_individual_id', class_name: 'Individual' @@ -159,12 +176,16 @@ class IndividualPopulation < Connection belongs_to :individual belongs_to :population - scope :displayable, -> { joins(:population).where(population: {display:true})} - scope :by_ids, ->(ids) { where(individual_id: ids) } + scope :displayable, -> { joins(:population).merge(Population.displayable) } + scope :thousand_genomes, -> { joins(:population).merge(Population.thousand_genomes)} + + scope :by_individual_ids, ->(ids) { where(individual_id: ids) } + + end class IndividualSynonym < Connection belongs_to :individual belongs_to :source @@ -187,10 +208,21 @@ belongs_to :variation_feature end class Phenotype < ModelBase has_many :phenotype_features + + def studies + ids=phenotype_features + .with_studies + .uniq + .pluck(:study_id) + + return nil unless ids.size > 0 # + + Study.where(study_id: ids) + end end class PhenotypeFeature < ModelBase # FIXME: Hack because using type column in the database self.inheritance_column = ':_no_inheritance_column' @@ -203,23 +235,57 @@ belongs_to :seq_region, class_name: 'Ensembl::Core::SeqRegion' has_many :phenotype_feature_attribs has_many :attrib_types, through: :phenotype_feature_attribs + scope :significant, -> { where(is_significant: true )} + scope :with_studies, -> { where.not(study_id:nil)} + def variation Variation.find_by name: object_id end + def risk_allele + pf=phenotype_feature_attribs.risk_alleles.first + pf.value unless pf.nil? + end + + def p_value + pf=phenotype_feature_attribs.p_values.first + pf.value unless pf.nil? + end + + def odds_ratio + pf=phenotype_feature_attribs.odds_ratios.first + pf.value unless pf.nil? + end + + def description + phenotype.description + end + end class PhenotypeFeatureAttrib < Connection belongs_to :attrib_type belongs_to :phenotype_feature + + scope :risk_alleles, -> { + where(attrib_type_id: AttribType.key(:risk_allele)) } + + scope :p_values, -> { + where(attrib_type_id: AttribType.key(:p_value)) } + + scope :odds_ratios, -> { + where(attrib_type_id: AttribType.key(:odds_ratio))} + + scope :betas, -> { + where(attrib_type_id: AttribType.key(:beta))} end class Population < ModelBase - self.extend Ensembl::SearchByName + # self.extend Ensembl::AttributeLike has_many :alleles has_many :population_synonyms has_many :individual_populations @@ -232,10 +298,11 @@ has_many :super_populations, through: :population_structures, source: :super_populaton has_many :population_genotypes scope :displayable, -> { where(display:'LD')} + scope :thousand_genomes, -> { displayable.starts_with(:name,'1000GENOMES')} def all_individual_populations IndividualPopulation.where(population_id: sub_population_ids(self)<<id) end @@ -287,20 +354,12 @@ class RegulatoryFeatureVariation < ModelBase belongs_to :variation_feature end - # class SeqRegion < Ensembl::Core::SeqRegion - # belongs_to :coord_system - # has_many :compressed_genotype_regions - # has_many :phenotype_features - # has_many :structureal_variation_features - # end - class StrainGtypePoly < Connection belongs_to :variation - end class StructuralVariation < ModelBase belongs_to :source belongs_to :study @@ -345,19 +404,29 @@ belongs_to :individual belongs_to :strain, foreign_key: 'strain_id', class_name: 'Individual' end class Source < ModelBase + has_many :studies + + scope :no_db_gap, -> { where.not(source_id: 46)} end class Study < ModelBase + # include AttributeLike + + default_scope -> { includes(:source) } + + belongs_to :source + has_many :associate_studies, foreign_key: 'study1_id' has_many :associated_studies, through: :associate_studies, source: :associated_study # FIXME: No data in database has_many :study_variations has_many :variations, through: :study_variations + end # FIXME: No data in database class StudyVariation < Connection belongs_to :study @@ -392,11 +461,10 @@ class TranslationMd5 < ModelBase end class Variation < ModelBase - self.extend Ensembl::SearchByName belongs_to :source has_many :variation_synonyms @@ -418,57 +486,125 @@ def phenotype_features PhenotypeFeature.eager_load(:phenotype).where(object_id_column: name, type: 'Variation') end - def synonyms - variation_synonyms.map{ |vs| vs.name } + def all_phenotype_features + object_ids = synonyms + object_ids<<name + PhenotypeFeature.eager_load(:phenotype).where(object_id: object_ids, type: 'Variation') end - def genotype_frequencies - igs=compressed_genotype_vars - .map{|cgv| cgv.individual_genotypes } - .flatten - .each_with_object(Hash.new){ |o,hsh| hsh[o.individual_id] = o.allele;hsh} + # Made because of the need to cut down database queries + # @return + # { phenotype_feature_id => + # { :phenotype=> "Phenotype description" , + # :phenotype_id => _ , + # :p_value => _ , + # :odds_ratio => _, + # :risk_allele => _ }, + # phenotype_feature_id => + # { :phenotype=> "Phenotype description" , + # :phenotype_id => _ , + # :p_value => _ , + # :odds_ratio => _, + # :risk_allele => _ }} + def phenotype_features_hash - counts=Hash.new 0 + # Do enable two level inserts hsh[:first][:second] + hash=Hash.new{ |hsh,key| hsh[key] = Hash.new {} } - IndividualPopulation - .displayable - .by_ids(igs.keys) - .map{|ip| [ip.population_id,igs[ip.individual_id]] } - .each{|pig| counts[pig]+=1 } + all_phenotype_features + .joins(:phenotype) + .pluck( + :phenotype_feature_id, + 'phenotype.description', + :phenotype_id) + .each{ |r| hash[r[0]][:phenotype]=r[1]; hash[r[0]][:phenotype_id]=r[2]} - counts.group_by{|k,v| k[0]} + PhenotypeFeatureAttrib + .where(phenotype_feature_id: hash.keys) + .pluck( + 'phenotype_feature_attrib.phenotype_feature_id', + 'phenotype_feature_attrib.value', + 'phenotype_feature_attrib.attrib_type_id') + .each{ |v| hash[v[0]][AttribType.symbol(v[2])]=v[1] } + + hash end - def individual_populations(individual_ids) - IndividualPopulation - .joins(:population) - .where(population: { display:true }) - .where(individual_population: { individual_id: individual_ids }) + def synonym_names + variation_synonyms.map{|vs| vs.name} end + # Genotype counts for each population + # @returns {"CSHL-HAPMAP:HapMap-CEU"=>{"C|T"=>59, "C|C"=>102, "T|T"=>12}, + # "CSHL-HAPMAP:HapMap-YRI"=>{"C|C"=>172, "C|T"=>1}} + def genotype_counts + counts = Hash.new{ |hsh,k| hsh[k] = Hash.new 0 } + + individual_populations.pluck('population.name',:individual_id).map{|ip| [ip[0],genotype_codes[individual_genotypes[ip[1]]]] }.each{|r| counts[r[0]][r[1]]+=1} + + return counts + end + + # Individual and genotype_code id's related to variation + # @returns + # Example: + # [[1,2],[2,3],[<individual_id>,<genotype_code_id>]] + def individual_genotypes + @individual_genotypes||=compressed_genotype_vars.map{|cgv| cgv.unpacked_genotypes }.flatten(1).to_h + end + + def individual_genotype_ids + individual_genotypes.keys + end + + # IndividualPopulations from individual_genotypes + # @returns [IndividualPopulation,IndividualPopulation,...] + def individual_populations + IndividualPopulation.where(individual_id: individual_genotype_ids) + end + + def genotype_code_ids + @genotype_code_ids||=individual_genotypes.values.uniq + end + + # Unique genotype codes from individual_genotypes + # @returns [<genotype_code_id>=>'G|C',2=>'A|A'] + def genotype_codes + @genotype_codes||=GenotypeCode.genotypes_hash_for(genotype_code_ids) + end + # Find Variation by also using VariationSynonyms # @name: name of the variation # @return: [Variation] def self.find_by_name(name) v = self.find_by(name: name) return v unless v.nil? vs = VariationSynonym.eager_load(:variation).find_by(name: name) - vs.variation unless vs.nil? + return vs.variation unless vs.nil? + nil end - def all_phenotype_features - object_ids = variation_synonyms.pluck :name - object_ids<<name - PhenotypeFeature.where(object_id: object_ids, type: 'Variation') + def self.find_all_by_name(name) + v_ids = where(name: name).pluck(:variation_id) + v_ids = variation_synonyms.where(name: name).pluck(:variation_id) if v_ids.nil? + + return nil if v_ids.nil? + + where(variation_id: v_ids).order(:name) end - # def population_genotypes - # PopulationGenotype.where(variation_id: id) - # end + def genes + variation_genenames.pluck(:gene_name) + end + + def positions + variation_features.includes(:seq_region).pluck('seq_region.name',:seq_region_start,:seq_region_end,:seq_region_strand).map{|r| Ensembl::Helpers::VariationPosition.new(r)} + end + end class VariationCitation < Connection self.table_name = 'variation_citation' belongs_to :variation @@ -486,10 +622,19 @@ def variation_sets VariationSets.where[variation_set_id: [variation_set_id.split(',').map{|id| id.to_i }]] unless variation_set_id.nil? end + def strand_name(id) + case(id) + when 1 + 'forward' + else + 'reverse' + end + end + def class_type Attrib.find(class_attrib_id) unless class_attrib_id.nil? end end @@ -500,11 +645,11 @@ class VariationHgvs < Connection belongs_to :variation end class VariationSet < ModelBase - self.extend Ensembl::SearchByName + # self.extend Ensembl::SearchByName belongs_to :short_name, foreign_key: 'short_name_attrib_id', class_name: 'Attrib' has_many :structural_variations has_many :sub_variation_set_structures, foreign_key: 'variation_set_super', class_name: 'VariationSetStructure' @@ -534,8 +679,21 @@ end class VariationSynonym < ModelBase belongs_to :variation belongs_to :source + + scope :name_like, ->(name, search_type=:starts_with){ + at=self.arel_table + + if search_type == :ends_with + where(at[:name].matches("%#{name}")) + elsif search_type == :starts_with + where(at[:name].matches("#{name}%")) + else + where(at[:name].matches("%#{name}%")) + end + + } end end end \ No newline at end of file