Sha256: 8e356adf3dbd17c75a9a75dc3108adff58e1ad5d5659fcaa2da645a0691f1847

Contents?: true

Size: 1.86 KB

Versions: 5

Compression:

Stored size: 1.86 KB

Contents

module Sites
  # Scraper for the remote Rails job listings on 42jobs.io.
  #
  # Walks the first `total_pages` listing pages (via the `?page=N` query
  # parameter), opens every offer linked from them and appends one CSV row per
  # offer: [offer URL, location, keywords].
  #
  # NOTE(review): `build_url`, `open_page` and `filepath` are not defined in
  # this file; they are presumably inherited from Base -- confirm there.
  class JobsRails42 < Base
    # The class is named JobsRails42 rather than 42JobsRails because a Ruby
    # constant name cannot start with a digit.

    HOST = 'https://www.42jobs.io'.freeze
    PROGRAMMING = '/rails/jobs-remote'.freeze
    JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
    STORE_DIR = 'data/jobs_rails42'.freeze
    # Number of offers a single listing page is expected to show; only used
    # for the up-front estimate kept in @count.
    OFFERS_PER_PAGE = 25

    # @param job_type [Symbol] listing category (presumably consumed by
    #   `build_url` to pick the path constant -- confirm in Base)
    # @param total_pages [Integer] how many listing pages to walk
    def initialize(job_type: :programming, total_pages: 4)
      @job_type = job_type
      @url = build_url
      @doc = nil
      @current_time = Time.new
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @total_pages = total_pages
      # Expected number of offers (an estimate, not what was actually scraped).
      @count = get_count
      # Number of rows really written by the current/last #collect_jobs run.
      @collected = 0
    end

    # Fetches every listing page from 1 to `total_pages` and stores the offers
    # found on each of them in the CSV file returned by `filepath`.
    def collect_jobs
      @collected = 0

      (1..@total_pages).each do |page|
        current_page = "#{@url}?page=#{page}"
        doc = Nokogiri::HTML(open_page(current_page))
        process_page(doc, current_page, page)
      end
    end

    private

    # Appends one CSV row per offer linked from a single listing page.
    #
    # @param doc [Nokogiri::HTML::Document] parsed listing page
    # @param page_url [String] URL the listing page was fetched from (logging only)
    # @param page [Integer] 1-based page number; the summary line is printed
    #   once this equals `@total_pages`
    def process_page(doc, page_url, page)
      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
      FileUtils.mkdir_p STORE_DIR

      # 'ab' => append, so rows from every page end up in the same file.
      CSV.open(filepath, 'ab') do |csv|
        doc.css(JOB_ITEM_SELECTOR).each do |link|
          csv << offer_row("#{HOST}#{link["href"]}")
          @collected += 1
        end
      end

      return unless page == @total_pages

      # Report the number of rows actually written and read the listing URL
      # from @url, which is set in this class.
      puts "[Done] Collected #{@collected} job offers from #{@url}. Data stores in: #{filepath}."
    end

    # Downloads a single offer page and builds its CSV row.
    #
    # @param job_url [String] absolute URL of the offer
    # @return [Array] `[job_url, location, keywords]`, where location and
    #   keywords are whatever Support::OfferParser returns for the offer text
    def offer_row(job_url)
      puts "[Info] Processing #{job_url}..."
      job_page = Nokogiri::HTML(open_page(job_url))
      offer_text = job_page.css('.job-offer__description').to_s

      location = Support::OfferParser.get_location(offer_text)
      keywords = Support::OfferParser.get_keywords(offer_text)

      [job_url, location, keywords]
    end

    # @return [Integer] expected number of offers across all requested pages
    def get_count
      OFFERS_PER_PAGE * @total_pages
    end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
remote_job_scraper-0.4.2 lib/sites/jobs_rails42.rb
remote_job_scraper-0.4.1 lib/sites/jobs_rails42.rb
remote_job_scraper-0.4.0 lib/sites/jobs_rails42.rb
remote_job_scraper-0.3.1 lib/sites/jobs_rails42.rb
remote_job_scraper-0.3.0 lib/sites/jobs_rails42.rb