Sha256: 11cd1d36c5cf253280a4a3299250786832fd198695a0c17cd1c4cd15f15a070c

Contents?: true

Size: 1.7 KB

Versions: 21

Compression:

Stored size: 1.7 KB

Contents

require 'rbbt/util/misc'

# Despite its name, this proc singularizes: it strips one trailing "s" from
# the token and returns the result (the token is returned unchanged when it
# does not end in "s").
plural = proc { |t| t.sub(/s$/, '') }

# Token type definitions. Each entry names a token type and gives either a
# regular expression or a block that receives the word and returns a boolean.
# Entries written as a bare name supply no explicit matcher here.
# NOTE(review): the `tokens` DSL is defined outside this file; how bare names
# are matched, and whether declaration order matters, must be confirmed there.
tokens do

  # Some (possible) single letters first
  receptor(/^(?:receptor|r)s?$/i)
  protein(/^(?:protein|p)s?$/i)
  roman(/^[IV]+$/)
  # True when the lower-cased word has an entry in the $inverse_greek table.
  greek_letter { |w| !$inverse_greek[w.downcase].nil? }


  # Some words for removal
  stopword { |w| $stopwords.include?(w.downcase_first) }
  # NOTE(review): unlike the anchored patterns above, this one matches any
  # word that merely contains "gene" -- confirm that this is intended.
  gene(/genes?/i)
  # NOTE(review): if bare names are matched as substrings in declaration
  # order, `dna` and `rna` would shadow `cdna`, `mrna` and `trna` -- verify
  # against the tokenizer before reordering.
  dna
  cdna
  rna
  mrna
  trna
  component
  exon
  intron
  domain
  family


  # Important words
  # Either a single digit, or two or more digits with at most one "." or ","
  # somewhere between the first and the last digit.
  number(/^(?:\d+[.,]?\d+|\d)$/)
  # True when the lower-cased word has an entry in the $greek table.
  greek { |w| !$greek[w.downcase].nil? }
  special { |w| w.is_special? }
  promoter
  similar(/^(homolog.*|like|related|associated)$/)
  # Suffix-based types: words ending in "ase" / "in".
  ase(/ase$/)
  in_end(/in$/)
end

# Scoring and transformation rules, registered one statement at a time through
# the `comparisons` DSL. The meaning of the compare/diff/same/miss/extr/
# transform rule families is defined by that DSL, not in this file.
comparisons do

  # Custom score for the :number type. l1 and l2 are presumably the number
  # tokens found on each side of the comparison; the first matching condition
  # decides the score.
  compare.number do |l1, l2|
    if l1.empty? && l2.empty?
      0
    elsif l1.sort.uniq == l2.sort.uniq
      3
    elsif l1.any? && l1[0] == l2[0]
      -3
    elsif l1.empty? && l2 == ['1']
      -5
    else
      -10
    end
  end

  # Fixed scores under `diff` for these token types.
  diff.promoter(-10)
  diff.receptor(-10)
  diff.similar(-10)
  diff.capital(-10)

  # Fixed same / miss / extr scores for :unknown tokens.
  same.unknown(1)
  miss.unknown(-2)
  extr.unknown(-2)

  # Fixed same / miss / extr scores for :greek tokens.
  same.greek(1)
  miss.greek(-2)
  extr.greek(-2)

  # Fixed same / miss / extr scores for :special tokens.
  same.special(4)
  miss.special(-3)
  extr.special(-3)

  # `plural` is the proc assigned near the top of this file; it strips a
  # trailing "s" from the token.
  transform.receptor(plural)
  transform.protein(plural)

  # The remaining transforms return a [value, type] pair.
  transform.roman { |t| [t.arabic, :number] }
  transform.greek_letter { |t| [$inverse_greek[t.downcase], :greek] }
  transform.ase { |t| [t, :special] }
  transform.in_end { |t| [t, :special] }
  # Unknown tokens shorter than four characters are re-typed as :special.
  transform.unknown { |t| [t, (t.length < 4 ? :special : :unknown)] }
end

Version data entries

21 entries across 21 versions & 1 rubygem

Version Path
rbbt-text-1.3.8 share/rnorm/tokens_default
rbbt-text-1.3.7 share/rnorm/tokens_default
rbbt-text-1.3.6 share/rnorm/tokens_default
rbbt-text-1.3.5 share/rnorm/tokens_default
rbbt-text-1.3.4 share/rnorm/tokens_default
rbbt-text-1.3.3 share/rnorm/tokens_default
rbbt-text-1.3.2 share/rnorm/tokens_default
rbbt-text-1.3.1 share/rnorm/tokens_default
rbbt-text-1.3.0 share/rnorm/tokens_default
rbbt-text-1.2.0 share/rnorm/tokens_default
rbbt-text-1.1.9 share/rnorm/tokens_default
rbbt-text-1.1.8 share/rnorm/tokens_default
rbbt-text-1.1.7 share/rnorm/tokens_default
rbbt-text-1.1.6 share/rnorm/tokens_default
rbbt-text-1.1.5 share/rnorm/tokens_default
rbbt-text-1.1.4 share/rnorm/tokens_default
rbbt-text-1.1.3 share/rnorm/tokens_default
rbbt-text-1.1.2 share/rnorm/tokens_default
rbbt-text-1.1.1 share/rnorm/tokens_default
rbbt-text-1.1.0 share/rnorm/tokens_default