lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.9 vs lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.10

- old
+ new

@@ -45,17 +45,17 @@ @agent = Mechanize.new @agent.user_agent_alias = USER_AGENTS.sample @agent.max_history = 0 page = @agent.get(url) return Linkedin::Profile.new(page, url) - rescue=>e + rescue => e puts e end end def get_skills(page) - page.search('.competency.show-bean').map{|skill|skill.text.strip if skill.text} + page.search('.competency.show-bean').map{|skill|skill.text.strip if skill.text} rescue nil end def get_company_url(node) result={} if node.at("h4/strong/a") @@ -186,103 +186,117 @@ def get_organizations(page) organizations = [] # if the profile contains org data if page.search('ul.organizations li.organization').first - # loop over each element with org data page.search('ul.organizations li.organization').each do |item| - # find the h3 element within the above section and get the text with excess white space stripped - name = item.search('h3').text.gsub(/\s+|\n/, " ").strip - position = nil # add this later - occupation = nil # add this latetr too, this relates to the experience/work - start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first) - if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present' - end_date = nil - else - Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last) - end - organizations << { name: name, start_date: start_date, end_date: end_date } - end + begin + # find the h3 element within the above section and get the text with excess white space stripped + name = item.search('h3').text.gsub(/\s+|\n/, " ").strip + position = nil # add this later + occupation = nil # add this latetr too, this relates to the experience/work + start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first) + if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present' + end_date = nil + else + Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last) + end + organizations << { name: name, start_date: start_date, end_date: end_date } + rescue => e + + end + end return organizations - end # page.search('ul.organizations li.organization').first + end end def get_languages(page) languages = [] # if the profile contains org data if page.search('ul.languages li.language').first # loop over each element with org data page.search('ul.languages li.language').each do |item| - # find the h3 element within the above section and get the text with excess white space stripped - language = item.at('h3').text - proficiency = item.at('span.proficiency').text.gsub(/\s+|\n/, " ").strip - languages << { language:language, proficiency:proficiency } + begin + # find the h3 element within the above section and get the text with excess white space stripped + language = item.at('h3').text + proficiency = item.at('span.proficiency').text.gsub(/\s+|\n/, " ").strip + languages << { language:language, proficiency:proficiency } + rescue => e + end end return languages end # page.search('ul.organizations li.organization').first end def get_certifications(page) certifications = [] + # search string to use with Nokogiri query = 'ul.certifications li.certification' months = 'January|February|March|April|May|June|July|August|September|November|December' regex = /(#{months}) (\d{4})/ # if the profile contains cert data if page.search(query).first # loop over each element with cert data page.search(query).each do |item| - item_text = item.text.gsub(/\s+|\n/, " ").strip - name = item_text.split(" #{item_text.scan(/#{months} \d{4}/)[0]}")[0] - authority = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data - license = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data - start_date = Date.parse(item_text.scan(regex)[0].join(' ')) + begin + item_text = item.text.gsub(/\s+|\n/, " ").strip + name = item_text.split(" #{item_text.scan(/#{months} \d{4}/)[0]}")[0] + authority = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data + license = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data + start_date = Date.parse(item_text.scan(regex)[0].join(' ')) - includes_end_date = item_text.scan(regex).count > 1 - end_date = includes_end_date ? Date.parse(item_text.scan(regex)[0].join(' ')) : nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data + includes_end_date = item_text.scan(regex).count > 1 + end_date = includes_end_date ? Date.parse(item_text.scan(regex)[0].join(' ')) : nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data - certifications << { name:name, authority:authority, license:license, start_date:start_date, end_date:end_date } + certifications << { name:name, authority:authority, license:license, start_date:start_date, end_date:end_date } + rescue => e + end end return certifications end + end def get_organizations(page) organizations = [] # if the profile contains org data if page.search('ul.organizations li.organization').first # loop over each element with org data page.search('ul.organizations li.organization').each do |item| - # find the h3 element within the above section and get the text with excess white space stripped - name = item.search('h3').text.gsub(/\s+|\n/, " ").strip - position = nil # add this later - occupation = nil # add this latetr too, this relates to the experience/work - start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first) - if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present' - end_date = nil - else - Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last) - end + begin + # find the h3 element within the above section and get the text with excess white space stripped + name = item.search('h3').text.gsub(/\s+|\n/, " ").strip + position = nil # add this later + occupation = nil # add this latetr too, this relates to the experience/work + start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first) + if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present' + end_date = nil + else + Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last) + end - organizations << { name: name, start_date: start_date, end_date: end_date } + organizations << { name: name, start_date: start_date, end_date: end_date } + rescue => e + end end - - return organizations - end # page.search('ul.organizations li.organization').first + end + return organizations end + def get_recommended_visitors(page) recommended_vs=[] if page.search(".browsemap").first page.at(".browsemap").at("ul").search("li").each do |visitor| v = {} @@ -293,7 +307,8 @@ recommended_vs << v end return recommended_vs end end + end end