Sha256: 8b1c82218753c3373835f23e1798fd24471694e822e8f96d1fcb08d6890ba76d

Contents?: true

Size: 1.88 KB

Versions: 2

Compression:

Stored size: 1.88 KB

Contents

module Sites
  # Scraper for the remote Rails job listing on 42jobs.io.
  #
  # The class is named JobsRails42 (not 42JobsRails) because a Ruby constant
  # name cannot start with a digit.
  #
  # NOTE(review): +build_url+, +open_page+ and +filepath+ are not defined in
  # this file -- presumably they are inherited from Base; verify there.
  class JobsRails42 < Base
    HOST = 'https://www.42jobs.io'.freeze
    PROGRAMMING = '/rails/jobs-remote'.freeze
    JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
    STORE_DIR = 'data/jobs_rails42'.freeze

    # Number of listing pages requested by #collect_jobs (pages 1..N).
    # TODO: detect the real last page instead of relying on a fixed number.
    NUMBER_OF_PAGES = 10

    # @param job_type [Symbol] kind of listing to scrape; stored in @job_type
    #   before +build_url+ is called.
    def initialize(job_type: :programming)
      @job_type = job_type
      @url = build_url
      @doc = nil
      @current_time = Time.now
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @count = get_count
    end

    # Walks listing pages 1..NUMBER_OF_PAGES and processes each of them,
    # appending one CSV row per job offer found.
    def collect_jobs
      (1..NUMBER_OF_PAGES).each do |page|
        current_page = "#{@url}?page=#{page}"
        doc = Nokogiri::HTML(open_page(current_page))
        process_page(doc, current_page, page)
      end
    end

    private

    # Appends every offer linked from one listing page to the CSV file.
    #
    # @param doc [Nokogiri::HTML::Document] parsed listing page
    # @param page_url [String] URL of the listing page (used for logging only)
    # @param page [Integer] 1-based page number; the summary line is printed
    #   once the last page (NUMBER_OF_PAGES) has been processed
    def process_page(doc, page_url, page)
      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
      FileUtils.mkdir_p STORE_DIR

      # Append mode ('ab') so that rows from all pages end up in the same file.
      CSV.open(filepath, 'ab') do |csv|
        doc.css(JOB_ITEM_SELECTOR).each do |link|
          job_url = "#{HOST}#{link["href"]}"
          puts "[Info] Processing #{job_url}..."
          job_page = Nokogiri::HTML(open_page(job_url))
          offer_text = job_page.css('.job-offer__description').to_s

          location = Support::OfferParser.get_location(offer_text)
          region   = nil # no region is extracted for this site
          keywords = Support::OfferParser.get_keywords(offer_text)

          csv << [job_url, location, region, keywords]
        end
      end

      return unless page == NUMBER_OF_PAGES

      # Use @url (as everywhere else in this class) rather than a bare `url`,
      # which only resolves if a reader happens to be defined elsewhere.
      puts "[Done] Collected #{@count} job offers from #{@url}. Data stores in: #{filepath}."
    end

    # Estimated number of offers: 25 per page times NUMBER_OF_PAGES.
    # This is not a tally of the rows actually written.
    def get_count
      25 * NUMBER_OF_PAGES
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
remote_job_scraper-0.2.0 lib/sites/jobs_rails42.rb
remote_job_scraper-0.1.0 lib/sites/jobs_rails42.rb