Sha256: 60adc13ca1441247da94311d681e3f8ef834f8ebefe85372321085375e754eee

Contents?: true

Size: 1019 Bytes

Versions: 14

Compression:

Stored size: 1019 Bytes

Contents

# Helpers for turning free-form (possibly HTML) text into short,
# underscore-separated identifiers suitable for use as column names.
module Columnizer
  # Converts +text+ into a normalized column name.
  #
  # Steps, in order:
  # 1. HTML tags, newlines and tabs are replaced with spaces.
  # 2. The text is downcased.
  # 3. Single and double quotes are removed.
  # 4. Parenthesized text (including the parens) is removed.
  # 5. Every remaining non-word character becomes a space.
  # 6. A fixed list of common short words is dropped.
  # 7. Only the LAST five remaining words are kept.
  # 8. The words are joined with underscores.
  #
  # @param text [String] source text; it is not modified
  # @return [String] the normalized name ("" when no words remain)
  def self.to_normalized_column(text)
    words_to_omit = %w[a to the of has have it is in on or but when be].freeze

    # NOTE: no trailing "s" flag on this regexp. In Ruby 1.9+ that flag pins
    # the pattern to Windows-31J and makes matching raise
    # Encoding::CompatibilityError for UTF-8 input with non-ASCII characters.
    # It is not needed for multi-line tags either: [^>] already matches "\n".
    col_text = text.gsub(/(<[^>]*>)|\n|\t/, ' ')

    col_text = col_text.downcase
    # Remove potential problem characters (quotes)
    col_text = col_text.gsub(/\"|\'/, '')
    # Remove text inside parens (non-greedy, so each pair is handled separately)
    col_text = col_text.gsub(/\(.*?\)/, '')
    # Replace all other non-word characters with spaces
    col_text = col_text.gsub(/\W/, ' ')

    column_words = col_text.split(' ').reject { |word| words_to_omit.include?(word) }

    # Limit column length: keep only the trailing five words
    column_words.last(5).join("_")
  end
end

Version data entries

14 entries across 14 versions & 2 rubygems

Version Path
breakpointer-surveyor-0.2.0 script/surveyor/columnizer.rb
surveyor-0.8.0 script/surveyor/columnizer.rb
surveyor-0.7.1 script/surveyor/columnizer.rb
surveyor-0.7.0 script/surveyor/columnizer.rb
surveyor-0.6.10 script/surveyor/columnizer.rb
surveyor-0.6.9 script/surveyor/columnizer.rb
surveyor-0.6.8 script/surveyor/columnizer.rb
surveyor-0.6.7 script/surveyor/columnizer.rb
surveyor-0.6.6 script/surveyor/columnizer.rb
surveyor-0.6.5 script/surveyor/columnizer.rb
surveyor-0.6.4 script/surveyor/columnizer.rb
surveyor-0.6.3 script/surveyor/columnizer.rb
surveyor-0.6.2 script/surveyor/columnizer.rb
surveyor-0.6.1 script/surveyor/columnizer.rb