lib/pets/genomic_features.rb in pets-0.2.4 vs lib/pets/genomic_features.rb in pets-0.2.5
- old
+ new
@@ -1,25 +1,61 @@
class Genomic_Feature
+ @@ref = nil
+
+ def self.array2genomic_feature(arr)
+ new(arr.map{|r| yield(r)})
+ end
+
+ def self.hash2genomic_feature(h)
+ vars = []
+ h.each do |h, v|
+ vars << yield(h, v)
+ end
+ new(vars)
+ end
+
+ def self.add_reference(genomic_regions)
+ @@ref = genomic_regions
+ end
+
#If any method use gen_fet as name is a Genomic_Feature object
- def initialize(feat_array) # [[chr1, start1, stop1],[chr1, start1, stop1]]
+ def initialize(feat_array, annotations: nil) # [[chr1, start1, stop1],[chr1, start1, stop1]]
@regions = {}
+ @reg_by_to = {}
@reg_id = -1
load_features(feat_array)
+ load_annotations(annotations) if !annotations.nil?
end
def load_features(feat_array)
- feat_array.each do |chr, start, stop|
+ feat_array.each do |chr, start, stop, to|
chr = chr.to_sym
- region = {start: start, stop: stop, to: @reg_id +=1 }
+ @reg_id +=1
+ id = to.nil? ? @reg_id : to
+ region = {chr: chr, start: start, stop: stop, to: id }
+ @reg_by_to[id] = region
add_record(@regions, chr, region)
end
end
+ def load_annotations(annotations)
+ each do |chr, reg|
+ annot = annotations[reg[:to]]
+ reg[:attrs] = annot if !annot.nil?
+ end
+ end
+
def length
return @regions.length
end
+ def each_chr()
+ @regions.each do |chr, regs|
+ yield(chr, regs)
+ end
+ end
+
def each()
@regions.each do |chr, regs|
regs.each do |region|
yield(chr, region)
end
@@ -28,19 +64,56 @@
def get_chr
return @regions.keys
end
+ def get_chr_regs(chr)
+ return @regions[chr]
+ end
+
+ def region_by_to(to)
+ return @reg_by_to[to]
+ end
+
def get_sizes
sizes = []
each do |chr, region|
size = region[:stop] - region[:start] + 1
sizes << size
end
return sizes
end
+ def get_features(attr_type: nil)
+ features = match(@@ref)
+ if !attr_type.nil?
+ features.each do |reg_id, feat_ids|
+ new_feat_ids = feat_ids.map{|fi| @@ref.region_by_to(fi).dig(:attrs, attr_type)}
+ features[reg_id] = new_feat_ids.compact.uniq
+ end
+ end
+ return features
+ end
+
+ def match(other_gen_feat)
+ all_matches = {}
+ each_chr do |chr, regs|
+ other_regs = other_gen_feat.get_chr_regs(chr)
+ next if other_regs.nil?
+ regs.each do |reg|
+ local_matches = []
+ start = reg[:start]
+ stop = reg[:stop]
+ other_regs.each do |other_reg|
+ local_matches << other_reg[:to] if coor_overlap?(start, stop, other_reg)
+ end
+ all_matches[reg[:to]] = local_matches if !local_matches.empty?
+ end
+ end
+ return all_matches
+ end
+
def get_summary_sizes
sizes = Hash.new(0)
each do |chr, region|
size = region[:stop] - region[:start] + 1
sizes[size] += 1
@@ -58,25 +131,25 @@
def get_reference_overlaps(genomic_ranges, reference)
overlaps = []
reference.each do |start, stop|
reg_ids = []
genomic_ranges.each do |reg|
- reg_ids << reg[:to] if coor_overlap?(start, stop, reg)
+ overlap = coor_overlap?(start, stop, reg)
+ reg_ids << reg[:to] if overlap
end
overlaps << reg_ids.uniq
end
return overlaps
end
- def generate_cluster_regions(meth, tag, ids_per_reg = 1)
+ def generate_cluster_regions(meth, tag, ids_per_reg = 1, obj = false)
compute_windows(meth) # Get putative genome windows
- patients_out_of_cluster = 0
ids_by_cluster = {}
annotated_full_ref = [] # All reference windows wit uniq id and chr tagged
@regions.each do |chr, regs|
reference = @windows[chr]
- overlaps = get_reference_overlaps(regs, reference) # See what patient has match with a overlap region
+ overlaps = get_reference_overlaps(regs, reference)
clust_numb = 0
reference.each_with_index do |ref, i|
current_ids = overlaps[i]
if current_ids.length > ids_per_reg
clust_id = "#{chr}.#{clust_numb +=1}.#{tag}.#{current_ids.length}"
@@ -85,10 +158,11 @@
end
annotated_full_ref << ref.dup.concat([chr, clust_id])
end
end
end
+ annotated_full_ref = Genomic_Feature.array2genomic_feature(annotated_full_ref){|r| [r[2], r[0], r[1], r[3]]} if obj
return ids_by_cluster, annotated_full_ref
end
def compute_windows(meth)
@windows = {}
@@ -114,18 +188,40 @@
end
end
def compute_region_overlap_windows(genomic_ranges)
reference = []
- reference.concat(genomic_ranges.map{|gr| gr[:start]})# get start
- reference.concat(genomic_ranges.map{|gr| gr[:stop]})# get stop
+ single_nt = []
+ genomic_ranges.each do |gr|
+ start = gr[:start]
+ stop = gr[:stop]
+ if stop - start > 0
+ reference << start # get start
+ reference << stop # get stop
+ else # Build a window of at least one nt for snv
+ single_nt << start
+ end
+ end
reference.uniq!
+ single_nt.each do |snt| # add start stop for snv
+ reference << snt
+ reference << snt
+ end
reference.sort!
#Define overlap ranges
final_reference = []
+ last_len = 1
reference.each_with_index do |coord,i|
next_coord = reference[i + 1]
- final_reference << [coord, next_coord] if !next_coord.nil?
+ if !next_coord.nil?
+ current_len = next_coord - coord
+ coord = coord + 1 if last_len == 0 # Separate SNV window from others
+ if current_len == 0 && last_len > 0 && !final_reference.empty?
+ final_reference.last[1] -= 1 # Separate SNV window from others
+ end
+ final_reference << [coord, next_coord]
+ last_len = current_len
+ end
end
return final_reference
end
def coor_overlap?(start, stop, reg)