Sha256: df92476f9a4a7d46ebb76e637f4a3f05c4eda6fc18d16ddb5ff791563694369e

Contents?: true

Size: 862 Bytes

Versions: 3

Compression:

Stored size: 862 Bytes

Contents

require 'hpricot'
require 'rest_client'

# Helpers for reducing HTML to plain text and for parsing a response body.
module Jkl

  # Reduces an HTML fragment to the text of its "long" lines.
  #
  # Processing steps, in order:
  # 1. Every <script> element is removed together with its body.
  # 2. Every remaining tag is removed; the text between tags is kept.
  # 3. The text is cut on "\r", and up to two leading whitespace characters
  #    are removed from the start of each line.
  # 4. Lines containing fewer than five spaces are discarded (they are
  #    usually just navigation).
  # 5. The surviving lines are concatenated with no separator between them.
  #
  # text - any object; it is converted with to_s (nil becomes "").
  #
  # Returns a new String, empty when no line survives.
  def self.sanitize(text)
    str = ""
    # The script body is matched lazily across newlines so that scripts
    # containing ">" (e.g. `if (a > b)`) are removed as well. The opening tag
    # may not start with "/", otherwise a stray closing tag could swallow all
    # text up to the next real script.
    without_scripts = text.to_s.gsub(/<\s*script\b[^>]*>.*?<\/script\s*>/im, "")
    without_tags = without_scripts.gsub(/<\/?[^>]*>/, "") # remove all tags
    without_tags.split("\r").each do |chunk|
      chunk = chunk.gsub(/^[ \t]/, "") # drop one leading space/tab per line
      chunk = chunk.gsub(/^[ \s]/, "") # drop one more leading whitespace char per line
      chunk.split("\n").each do |line|
        # remove short lines - usually just navigation
        str << line unless line.count(" ") < 5
      end
    end
    str
  end

  # Parses +response+ with Hpricot and returns the resulting document.
  #
  # If URI::InvalidURIError, SocketError or Errno::ECONNREFUSED is raised
  # while doing so, a "WARN: ..." line is printed to standard output and the
  # method returns nil (the return value of puts) instead of raising.
  def self.from_doc(response)
    Hpricot(response)
  rescue URI::InvalidURIError => e
    puts("WARN: Problem with getting a connection: #{e}")
  rescue SocketError => e
    puts("WARN: Could not connect to feed: #{e}")
  rescue Errno::ECONNREFUSED => e
    puts("WARN: Connection refused: #{e}")
  end

end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
jakal-0.0.9 lib/jkl/url_doc_handler.rb
jakal-0.0.8 lib/jkl/url_doc_handler.rb
jakal-0.0.7 lib/jkl/url_doc_handler.rb