Sha256: ad022e63ae2c0a5919fe0cc70444e98cebaff76c29622ea7c2b6b832b430dd24
Contents?: true
Size: 1.85 KB
Versions: 1
Compression:
Stored size: 1.85 KB
Contents
# frozen_string_literal: true

module Boxcars
  # A Boxcar that reads text from a URL.
  class URLText < Boxcar
    # the description of this boxcar
    DESC = "useful when you want to get text from a URL."

    # Implements a boxcar that fetches a URL and returns its text.
    # @param name [String] The name of the boxcar. Defaults to "FetchURL".
    # @param description [String] A description of the boxcar. Defaults to DESC.
    def initialize(name: "FetchURL", description: DESC)
      super(name: name, description: description)
    end

    # Get text from a url.
    # @param url [String] The url. Surrounding whitespace is ignored.
    # @return [Result] The value built by Result.from_text on success, or a
    #   Result with status :error when the response is not a Net::HTTPSuccess.
    def run(url)
      # Strip first: URI.parse raises on leading/trailing whitespace.
      url = URI.parse(url.to_s.strip)
      get_answer(url)
    end

    private

    # Convert an HTML response into plain text: headings, paragraphs and links,
    # one per block, separated by blank lines. Links are rendered in Markdown
    # form as "[text](href)".
    # @param url [URI] The URL the response came from; used as the base for relative links.
    # @param response [Net::HTTPResponse] The response whose body is parsed.
    # @return [String] The extracted text.
    def html_to_text(url, response)
      Nokogiri::HTML(response.body).css(%w[h1 h2 h3 h4 h5 h6 p a].join(",")).map do |e|
        # Collapse whitespace before stripping so non-ASCII spaces (e.g. NBSP)
        # at the edges are removed too.
        itxt = e.inner_text.gsub(/[[:space:]]+/, " ").strip
        if e.name == "a"
          link_to_markdown(url, itxt, e.attributes["href"]&.value)
        elsif !itxt.empty?
          itxt
        end
        # empty headings/paragraphs evaluate to nil here and are dropped by compact
      end.compact.join("\n\n")
    end

    # Render one anchor as Markdown.
    # @param base [URI] Base URL for resolving relative hrefs.
    # @param text [String] The (already normalized) link text.
    # @param href [String, nil] The raw href attribute, if any.
    # @return [String, nil] "[text](href)" when there is an href; otherwise the
    #   plain text, or nil when there is neither text nor href.
    def link_to_markdown(base, text, href)
      href = href.to_s.strip
      if href.empty?
        # An anchor without a target is just text; avoid emitting "[text]()".
        return text.empty? ? nil : text
      end

      "[#{text}](#{absolute_href(base, href)})"
    end

    # Resolve an href against the page URL.
    # @param base [URI] The page URL.
    # @param href [String] A non-empty href.
    # @return [String] The href unchanged when it already has a scheme
    #   (http:, mailto:, ...), the resolved absolute URL when it is relative,
    #   or the raw href when it cannot be parsed as a URI.
    def absolute_href(base, href)
      return href if href.match?(/\A[a-z][a-z0-9+.-]*:/i)

      URI.join(base, href).to_s
    rescue URI::Error
      # One malformed link must not abort conversion of the whole page.
      href
    end

    # Fetch the URL and wrap the outcome in a Result.
    # HTML bodies are reduced to text via html_to_text; every other content
    # type is passed through unchanged. Any response that is not a
    # Net::HTTPSuccess (including 3xx redirects, which are not followed here)
    # is reported as an error.
    # @param url [URI] The URL to fetch.
    # @return [Result] The text result or an error result.
    def get_answer(url)
      response = Net::HTTP.get_response(url)
      unless response.is_a?(Net::HTTPSuccess)
        return Result.new(status: :error, explanation: "Error with url: #{response.code} #{response.message}")
      end

      text = response.content_type == "text/html" ? html_to_text(url, response) : response.body
      Result.from_text(text)
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
boxcars-0.3.2 | lib/boxcars/boxcar/url_text.rb |