lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.9 vs lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.10
- old
+ new
@@ -45,17 +45,17 @@
@agent = Mechanize.new
@agent.user_agent_alias = USER_AGENTS.sample
@agent.max_history = 0
page = @agent.get(url)
return Linkedin::Profile.new(page, url)
- rescue=>e
+ rescue => e
puts e
end
end
# Collects the text of every skill node on the page.
#
# page - object responding to #search; it is queried with the CSS selector
#        '.competency.show-bean'.
#
# Returns an Array with one entry per matching node: the node's text with
# surrounding whitespace stripped (or nil if a node reports nil text).
# Returns nil if anything raises a StandardError while reading the page;
# this best-effort contract is kept from the previous inline `rescue nil`.
def get_skills(page)
  page.search('.competency.show-bean').map do |skill|
    text = skill.text
    text.strip if text
  end
rescue StandardError
  # Best-effort scraping: a page that cannot be read yields nil, not a crash.
  nil
end
def get_company_url(node)
result={}
if node.at("h4/strong/a")
@@ -186,103 +186,117 @@
# Extracts the organizations listed on a profile page.
#
# NOTE(review): a second `get_organizations` is defined further down in this
# file; in Ruby the later definition replaces this one, so one of the two
# should probably be removed.
#
# page - object responding to #search; queried with
#        'ul.organizations li.organization'.
#
# Returns an Array of Hashes { name:, start_date:, end_date: }. The Array is
# empty when the page has no organization entries. `end_date` is nil when the
# period ends in 'Present' or when the entry has no " to " separator.
# Entries whose dates cannot be parsed are skipped.
def get_organizations(page)
  organizations = []

  page.search('ul.organizations li.organization').each do |item|
    begin
      # Collapse all runs of whitespace so the heading reads as one line.
      name = item.search('h3').text.gsub(/\s+/, ' ').strip

      # The specifics text looks like "<start> to <end>".
      period = item.search('ul.specifics li').text.gsub(/\s+/, ' ').strip.split(' to ')
      start_date = Date.parse(period.first)

      finish = period.length > 1 ? period.last : nil
      end_date = (finish.nil? || finish == 'Present') ? nil : Date.parse(finish)

      organizations << { name: name, start_date: start_date, end_date: end_date }
    rescue StandardError
      # Best-effort scraping: a malformed entry is skipped, the rest are kept.
      next
    end
  end

  organizations
end
# Extracts the languages listed on a profile page.
#
# page - object responding to #search; queried with
#        'ul.languages li.language'.
#
# Returns an Array of Hashes { language:, proficiency: }, one per entry that
# has both an 'h3' node and a 'span.proficiency' node. The proficiency text
# has its whitespace collapsed and trimmed; the language text is taken as-is.
# Returns nil when the page has no language entries at all.
def get_languages(page)
  entries = page.search('ul.languages li.language')
  return unless entries.first

  languages = []
  entries.each do |item|
    begin
      heading = item.at('h3')
      level = item.at('span.proficiency')
      # Skip incomplete entries explicitly instead of relying on a swallowed
      # NoMethodError from calling #text on nil.
      next unless heading && level

      proficiency = level.text.gsub(/\s+/, ' ').strip
      languages << { language: heading.text, proficiency: proficiency }
    rescue StandardError
      # Best-effort scraping: any other per-entry failure skips that entry only.
      next
    end
  end
  languages
end
# Extracts the certifications listed on a profile page.
#
# page - object responding to #search; queried with
#        'ul.certifications li.certification'.
#
# Each entry's text is expected to contain "<Month> <yyyy>" dates. The text
# before the first date becomes the name, the first date is the start date
# and the second date (if any) is the end date.
#
# Returns an Array of Hashes
# { name:, authority:, license:, start_date:, end_date: }.
# `authority` and `license` are always nil: they are not extracted here.
# Entries without a recognizable date are skipped.
# Returns nil when the page has no certification entries at all.
def get_certifications(page)
  # search string to use with Nokogiri
  query = 'ul.certifications li.certification'
  # All twelve months; October was previously missing from this list.
  months = 'January|February|March|April|May|June|July|August|September|October|November|December'
  date_pattern = /(#{months}) (\d{4})/

  entries = page.search(query)
  return unless entries.first

  certifications = []
  entries.each do |item|
    begin
      item_text = item.text.gsub(/\s+/, ' ').strip

      # The grouped pattern requires a full "<Month> <yyyy>", so a bare month
      # word inside the certification name is not mistaken for the date.
      first_date = item_text.match(date_pattern)
      next unless first_date

      dates = item_text.scan(date_pattern).map { |month, year| Date.parse("#{month} #{year}") }

      certifications << {
        name: first_date.pre_match.strip,
        authority: nil,
        license: nil,
        start_date: dates[0],
        # Second date in the text, not the first one again.
        end_date: dates[1]
      }
    rescue StandardError
      # Best-effort scraping: a malformed entry is skipped, the rest are kept.
      next
    end
  end
  certifications
end
# Extracts the organizations listed on a profile page.
#
# NOTE(review): an earlier `get_organizations` is defined above in this file;
# being the later definition, this one is the one Ruby keeps. The duplicate
# should probably be removed.
#
# page - object responding to #search; queried with
#        'ul.organizations li.organization'.
#
# Returns an Array of Hashes { name:, start_date:, end_date: }. The Array is
# empty when the page has no organization entries. `end_date` is nil when the
# period ends in 'Present' or when the entry has no " to " separator.
# Entries whose dates cannot be parsed are skipped.
def get_organizations(page)
  organizations = []

  page.search('ul.organizations li.organization').each do |item|
    begin
      # Collapse all runs of whitespace so the heading reads as one line.
      name = item.search('h3').text.gsub(/\s+/, ' ').strip

      # The specifics text looks like "<start> to <end>".
      period = item.search('ul.specifics li').text.gsub(/\s+/, ' ').strip.split(' to ')
      start_date = Date.parse(period.first)

      # Previously the parsed end date was computed but never assigned.
      finish = period.length > 1 ? period.last : nil
      end_date = (finish.nil? || finish == 'Present') ? nil : Date.parse(finish)

      organizations << { name: name, start_date: start_date, end_date: end_date }
    rescue StandardError
      # Best-effort scraping: a malformed entry is skipped, the rest are kept.
      next
    end
  end

  organizations
end
+
def get_recommended_visitors(page)
recommended_vs=[]
if page.search(".browsemap").first
page.at(".browsemap").at("ul").search("li").each do |visitor|
v = {}
@@ -293,7 +307,8 @@
recommended_vs << v
end
return recommended_vs
end
end
+
end
end