Sha256: 8b1c82218753c3373835f23e1798fd24471694e822e8f96d1fcb08d6890ba76d
Contents?: true
Size: 1.88 KB
Versions: 2
Compression:
Stored size: 1.88 KB
Contents
module Sites
  # Collects remote Rails job offers from 42jobs.io and appends one CSV row
  # per offer to a file under STORE_DIR.
  #
  # The class is named JobsRails42 because a Ruby constant cannot begin with
  # a digit, so 42JobsRails is not a valid class name.
  #
  # The listing is paginated; the first NUMBER_OF_PAGES pages are requested.
  #
  # TODO: the number of pages is fixed. Detect the real last page instead of
  # always requesting NUMBER_OF_PAGES pages, and report the number of offers
  # actually written rather than the estimate from #get_count.
  #
  # NOTE(review): build_url, open_page and filepath are not defined in this
  # class; presumably they are inherited from Base -- confirm there.
  class JobsRails42 < Base
    HOST = 'https://www.42jobs.io'.freeze
    PROGRAMMING = '/rails/jobs-remote'.freeze
    JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
    STORE_DIR = 'data/jobs_rails42'.freeze
    NUMBER_OF_PAGES = 10

    # @param job_type [Symbol] kind of listing to scrape; stored in @job_type
    #   before build_url is called (presumably build_url reads it -- verify
    #   against Base).
    def initialize(job_type: :programming)
      @job_type = job_type
      @url = build_url
      @doc = nil
      @current_time = Time.now
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @count = get_count
    end

    # Requests listing pages 1..NUMBER_OF_PAGES in order and processes each
    # one. The return value is not meaningful.
    def collect_jobs
      (1..NUMBER_OF_PAGES).each do |page|
        current_page = "#{@url}?page=#{page}"
        doc = Nokogiri::HTML(open_page(current_page))
        process_page(doc, current_page, page)
      end
    end

    private

    # Visits every offer linked from one listing page and appends a row
    # [job_url, location, region, keywords] to the CSV file at filepath.
    # region is always nil for this site.
    #
    # @param doc [Nokogiri::HTML::Document] parsed listing page
    # @param page_url [String] URL of the listing page (used for logging only)
    # @param page [Integer] 1-based page number; the completion message is
    #   printed once the last page (NUMBER_OF_PAGES) has been processed
    # @return [nil]
    def process_page(doc, page_url, page)
      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
      FileUtils.mkdir_p(STORE_DIR)

      # 'ab' appends, so rows from every page end up in the same file.
      CSV.open(filepath, 'ab') do |csv|
        doc.css(JOB_ITEM_SELECTOR).each do |link|
          href = link['href']
          # An anchor without an href would resolve to the bare HOST and the
          # home page would be recorded as if it were an offer, so skip it.
          next if href.nil? || href.empty?

          job_url = "#{HOST}#{href}"
          puts "[Info] Processing #{job_url}..."
          job_page = Nokogiri::HTML(open_page(job_url))
          offer_text = job_page.css('.job-offer__description').to_s
          location = Support::OfferParser.get_location(offer_text)
          region = nil
          keywords = Support::OfferParser.get_keywords(offer_text)

          csv << [job_url, location, region, keywords]
        end
      end

      return unless page == NUMBER_OF_PAGES

      # @count is the estimate from #get_count, not the number of rows written.
      # Uses @url (set in #initialize) rather than a `url` reader, which this
      # class does not define.
      puts "[Done] Collected #{@count} job offers from #{@url}. Data stores in: #{filepath}."
    end

    # Estimated number of offers: 25 per page (the page size mentioned in the
    # original author's note) times NUMBER_OF_PAGES. Nothing is counted on
    # the site itself.
    #
    # @return [Integer]
    def get_count
      25 * NUMBER_OF_PAGES
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
remote_job_scraper-0.2.0 | lib/sites/jobs_rails42.rb |
remote_job_scraper-0.1.0 | lib/sites/jobs_rails42.rb |