Sha256: ad022e63ae2c0a5919fe0cc70444e98cebaff76c29622ea7c2b6b832b430dd24

Contents?: true

Size: 1.85 KB

Versions: 1

Compression:

Stored size: 1.85 KB

Contents

# frozen_string_literal: true

module Boxcars
  # A Boxcar that fetches a URL and returns its text content.
  #
  # HTML responses are reduced to the text of their headings, paragraphs and
  # links (links are rendered Markdown-style as "[text](href)"). Any other
  # successful response is returned as its raw body.
  class URLText < Boxcar
    # the description of this boxcar
    DESC = "useful when you want to get text from a URL."

    # CSS selector for the elements whose text is extracted from an HTML page.
    TEXT_SELECTOR = %w[h1 h2 h3 h4 h5 h6 p a].join(",").freeze
    private_constant :TEXT_SELECTOR

    # Creates a boxcar that fetches the text found at a URL.
    # @param name [String] The name of the boxcar. Defaults to "FetchURL".
    # @param description [String] A description of the boxcar. Defaults to DESC.
    def initialize(name: "FetchURL", description: DESC)
      super(name: name, description: description)
    end

    # Get text from a url.
    # @param url [String] The url
    # @return [Result] The text for the url, or an error Result when the
    #   server does not answer with a 2xx status.
    def run(url)
      get_answer(URI.parse(url))
    end

    private

    # Reduces an HTML response to plain text.
    # @param url [URI] The page URL, used as the base for site-relative links.
    # @param response [Net::HTTPResponse] A response whose body is HTML.
    # @return [String] One entry per matched element, separated by blank lines.
    #   Elements with empty text are kept (they yield an empty entry, or
    #   "[](href)" for a link).
    def html_to_text(url, response)
      Nokogiri::HTML(response.body).css(TEXT_SELECTOR).map do |element|
        # Collapse whitespace runs first, then strip: String#strip ignores
        # Unicode spaces (e.g. NBSP) that [[:space:]] matches, so stripping
        # first would leave a stray leading/trailing space behind.
        text = element.inner_text.gsub(/[[:space:]]+/, " ").strip
        element.name == "a" ? "[#{text}](#{link_target(url, element)})" : text
      end.join("\n\n")
    end

    # Resolves the target of an <a> element.
    # @param url [URI] The page URL that site-relative hrefs are resolved against.
    # @param element [Object] The anchor node (responds to #attributes).
    # @return [String, nil] The absolute URL for hrefs starting with "/", the
    #   href unchanged otherwise, or nil when the element has no href.
    def link_target(url, element)
      href = element.attributes["href"]&.value
      return href unless href&.start_with?("/")

      URI.join(url, href).to_s
    rescue URI::Error
      # A malformed href must not abort extraction of the whole page.
      href
    end

    # Fetches the URL and wraps the outcome in a Result.
    # NOTE: Net::HTTP.get_response does not follow redirects, so a 3xx answer
    # is reported through the error branch below.
    # @param url [URI] The URL to fetch.
    # @return [Result] The extracted text on a 2xx response, otherwise an
    #   error Result carrying the HTTP status code and message.
    def get_answer(url)
      response = Net::HTTP.get_response(url)
      unless response.is_a?(Net::HTTPSuccess)
        return Result.new(status: :error, explanation: "Error with url: #{response.code} #{response.message}")
      end

      # Only HTML needs reducing; every other content type is passed through as-is.
      text = response.content_type == "text/html" ? html_to_text(url, response) : response.body
      Result.from_text(text)
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
boxcars-0.3.2 lib/boxcars/boxcar/url_text.rb