lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.7 vs lib/linkedin-scraper/profile.rb in linkedin-scraper-0.0.8
- old
+ new
@@ -1,134 +1,78 @@
-USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
+# -*- coding: utf-8 -*-
module Linkedin
- class Profile
- #the First name of the contact
- attr_accessor :first_name,:last_name,:title,:location,:country,
- :industry, :linkedin_url,:recommended_visitors,:page
- #Array of hashes for eduction
- # [
- # [0] {
- # :name => "Vishwakarma Institute of Technology",
- # :description => "B.Tech, Computer Engineering",
- # :period => "2007 – 2011"
- # },
- # [1] {
- # :name => "St Ursula's High School",
- # :description => "Secondary School Education",
- # :period => nil
- # }
- # ]
- attr_accessor :education
+ class Profile
- #Array of websites
- #[
- #[0] "http://www.yatishmehta.in"
- #]
- attr_accessor :websites
- #array of hashes containing group name and link
- # [
- # [ 0] {
- # :name => "Business on Rails",
- # :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
- # },
- # [ 1] {
- # :name => "HTML5 Technologies",
- # :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
- # },
- # [ 2] {
- # :name => "India on Rails",
- # :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
- # :name => "Open Source",
- # :link => "http://www.linkedin.com/groups?gid=43875"
- # },
- # [ 4] {
- # :name => "Rails Developers",
- # :link => "http://www.linkedin.com/groups?gid=77764"
- # },
- # ]
- attr_accessor:groups
+ USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
+ attr_accessor :first_name,:last_name,:title,:location,:country, :industry,:picture,:linkedin_url,:recommended_visitors,:page
- #Array of hash containing its past job companies and job profile
- #Example
- # [
- # [0] {
- # :past_company => "Consumyze Software",
- # :past_title => "Trainee",
- # :past_company_website => "http://www.consumyze.com",
- # :description => "Responsible for design and development"
- # },
- # [1] {
- # :past_company => "SunGard Global Services",
- # :past_title => "Project Intern",
- # :past_company_website => "http://www.sungard.com/globalservices/learnmore",
- # :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
- # }
- # ]
+ attr_accessor :education
+ attr_accessor :websites
+ attr_accessor:groups
+
attr_accessor :past_companies
- #Array of hash containing its current job companies and job profile
- #Example
- # [
- # [0] {
- # :current_title => "Intern",
- # :current_company => "Sungard"
- # :current_company_url=>"http://www.betterlabs.net",
- # :description=>"Responsible for design and development of projects on Ruby on Rails."
- # },
- # [1] {
- # :current_title => "Software Developer",
- # :current_company => "Microsoft"
- # :current_company_url =>"http://www.microsoft.net",
- # :description =>"Development and design"
- # }
- # ]
attr_accessor :current_companies
- #url of the profile
+ attr_accessor :skills
- def initialize(page,url)
- @first_name=get_first_name(page)
- @last_name=get_last_name(page)
- @title=get_title(page)
- @location=get_location(page)
- @country=get_country(page)
- @industry=get_industry(page)
- @current_companies=get_current_companies page
- @past_companies=get_past_companies page
- @recommended_visitors=get_recommended_visitors page
- @education=get_education page
- @linkedin_url=url
- @websites=get_websites page
- @groups=get_groups page
- @page=page
+ def initialize(page,url)
+ @first_name = get_first_name(page)
+ @last_name = get_last_name(page)
+ @title = get_title(page)
+ @location = get_location(page)
+ @country = get_country(page)
+ @industry = get_industry(page)
+ @picture = get_picture(page)
+ @current_companies = get_current_companies(page)
+ @past_companies = get_past_companies(page)
+ @recommended_visitors = get_recommended_visitors(page)
+ @education = get_education(page)
+ @linkedin_url = url
+ @websites = get_websites(page)
+ @groups = get_groups(page)
+ @skills = get_skills(page)
+ @page = page
end
#returns:nil if it gives a 404 request
- def self.get_profile url
+ def self.get_profile(url)
begin
- @agent=Mechanize.new
+ @agent = Mechanize.new
@agent.user_agent_alias = USER_AGENTS.sample
@agent.max_history = 0
- page=@agent.get url
+ page = @agent.get(url)
return Linkedin::Profile.new(page, url)
rescue=>e
puts e
end
end
- def get_company_url node
+ def get_skills(page)
+ page.search('.competency.show-bean').map{|skill|skill.text.strip if skill.text}
+ end
+
+ def get_company_url(node)
+ result={}
if node.at("h4/strong/a")
- link=node.at("h4/strong/a")["href"]
- @agent=Mechanize.new
+ link = node.at("h4/strong/a")["href"]
+ @agent = Mechanize.new
@agent.user_agent_alias = USER_AGENTS.sample
@agent.max_history = 0
- page=@agent.get("http://www.linkedin.com"+link)
- url=page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
+ page = @agent.get("http://www.linkedin.com"+link)
+ result[:linkedin_company_url] = "http://www.linkedin.com"+link
+ result[:url] = page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
+ node_2 = page.at(".basic-info").at(".content.inner-mod")
+ node_2.search("dd").zip(node_2.search("dt")).each do |value,title|
+ result[title.text.gsub(" ","_").downcase.to_sym] = value.text.strip
+ end
+ result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n"," ").strip if page.at(".vcard.hq")
end
+ result
end
private
def get_first_name page
@@ -153,92 +97,95 @@
def get_industry page
return page.at(".industry").text.gsub(/\s+/, " ").strip if page.search(".industry").first
end
+ def get_picture page
+ return page.at("#profile-picture/img.photo").attributes['src'].value.strip if page.search("#profile-picture/img.photo").first
+ end
+
def get_past_companies page
past_cs=[]
if page.search(".position.experience.vevent.vcard.summary-past").first
page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
- url=get_company_url past_company
- title=past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
- company=past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
- description=past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
- past_company={:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
- past_cs<<past_company
+ result = get_company_url past_company
+ url = result[:url]
+ title = past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
+ company = past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
+ description = past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
+ p_company = {:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
+ p_company = p_company.merge(result)
+ past_cs << p_company
end
return past_cs
end
end
def get_current_companies page
- current_cs=[]
+ current_cs = []
if page.search(".position.experience.vevent.vcard.summary-current").first
page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
- url=get_company_url current_company
- title=current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
- company=current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
- description=current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
- current_company={:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
- current_cs<<current_company
+ result = get_company_url current_company
+ url = result[:url]
+ title = current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
+ company = current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
+ description = current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
+ current_company = {:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
+ current_cs << current_company.merge(result)
end
return current_cs
end
end
- def get_education page
+ def get_education(page)
education=[]
if page.search(".position.education.vevent.vcard").first
page.search(".position.education.vevent.vcard").each do |item|
- name=item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
- desc=item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
- period=item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
- edu={:name=>name,:description=>desc,:period=>period}
- education<<edu
+ name = item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
+ desc = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
+ period = item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
+ edu = {:name => name,:description => desc,:period => period}
+ education << edu
end
return education
end
end
- def get_websites page
+ def get_websites(page)
websites=[]
if page.search(".website").first
page.search(".website").each do |site|
- url=site.at("a")["href"]
- url="http://www.linkedin.com"+url
- url=CGI.parse(URI.parse(url).query)["url"]
- websites<<url
+ url = site.at("a")["href"]
+ url = "http://www.linkedin.com"+url
+ url = CGI.parse(URI.parse(url).query)["url"]
+ websites << url
end
return websites.flatten!
- end
+ end
end
- def get_groups page
- groups=[]
+ def get_groups(page)
+ groups = []
if page.search(".group-data").first
page.search(".group-data").each do |item|
- name=item.text.gsub(/\s+|\n/, " ").strip
- link="http://www.linkedin.com"+item.at("a")["href"]
- groups<<{:name=>name,:link=>link}
+ name = item.text.gsub(/\s+|\n/, " ").strip
+ link = "http://www.linkedin.com"+item.at("a")["href"]
+ groups << {:name=>name,:link=>link}
end
return groups
end
-
end
-
-
-
- def get_recommended_visitors page
+ def get_recommended_visitors(page)
recommended_vs=[]
if page.search(".browsemap").first
page.at(".browsemap").at("ul").search("li").each do |visitor|
- v={}
- v[:link]=visitor.at('a').attributes["href"]
- v[:name]=visitor.at('a').text
- v[:title]=visitor.at('.headline').text.split(" at ").first
- v[:company]=visitor.at('.headline').text.split(" at ").last
- recommended_vs<<v
+ v = {}
+ v[:link] = visitor.at('a')["href"]
+ v[:name] = visitor.at('strong/a').text
+ v[:title] = visitor.at('.headline').text.gsub("..."," ").split(" at ").first
+ v[:company] = visitor.at('.headline').text.gsub("..."," ").split(" at ")[1]
+ recommended_vs << v
end
return recommended_vs
end
end
end