require 'nokogiri'
require_relative 'parsing_tools/capybara_with_phantom_js'

module Janis
  module Parsing
    module SpecificParsers
      # Scrapes proxy entries from http://proxy-list.org by driving a browser
      # session (obtained from CapybaraWithPhantomJs#new_session) through the
      # paginated listing and reading each page's HTML with Nokogiri.
      class ProxyListOrgParser < ProxyWebsiteParser
        include CapybaraWithPhantomJs

        # Registers Struct::Row, the record built for every listing entry.
        # NOTE(review): Struct.new with a String name defines the constant
        # globally under Struct; another file registering 'Row' would collide.
        Struct.new('Row', :proxy, :country, :city, :type, :speed, :https_ssl)

        # Page links clicked, in order, after the landing page has been read.
        ADDITIONAL_PAGES = (2..10)

        # @return [String] address of the site this parser scrapes
        def self.url
          'http://proxy-list.org'
        end

        # Opens a session, visits the site and loads the first page's HTML.
        #
        # NOTE(review): `url` is called on the instance here and in
        # #configure_capybara, while this file only defines the class-level
        # .url; presumably ProxyWebsiteParser or the mixin supplies an
        # instance-level #url. Confirm.
        def initialize
          super
          configure_capybara
          @session = new_session
          @session.visit(url)
          obtain_html_doc
        end

        # Sets Capybara's app_host to this parser's url.
        def configure_capybara
          Capybara.configure { |c| c.app_host = url }
        end

        # Collects the rows of the page that is already loaded, then clicks
        # through every page link in ADDITIONAL_PAGES and collects those too.
        #
        # @return [Array<String>] the `proxy` field of every collected row
        def parse
          total_rows = rows

          ADDITIONAL_PAGES.each do |page_number|
            @session.click_link(page_number.to_s)
            obtain_html_doc
            total_rows += rows
          end

          # TODO: This map is here to adapt #parse output to the one expected by Janis.find.
          # Remove this when it starts accepting more info about each proxy server.
          total_rows.map(&:proxy)
        end

        private

        # Re-parses the session's current HTML into @html_doc.
        def obtain_html_doc
          @html_doc = Nokogiri.HTML(@session.html)
        end

        # Builds one Struct::Row for each <ul> of the current document whose
        # markup contains three digits followed by a dot (presumably the start
        # of an IP address; verify against the live page).
        #
        # @return [Array<Struct::Row>]
        def rows
          rows_in_html = @html_doc.css('ul').select { |ul| ul.to_s.match(/\d\d\d\./) }

          # TODO: This should be an actual class, and should have methods to retrieve all attributes.
          rows_in_html.map do |row_html|
            Struct::Row.new(
              # Only the last child node of the `.proxy` element is used as the proxy text.
              row_html.css('.proxy').children.last.text,
              row_html.css('.country').text,
              row_html.css('.city').text,
              row_html.css('.type').text,
              row_html.css('.speed').text,
              row_html.css('.https').text
            )
          end
        end
      end
    end
  end
end