lib/relaton_ieee/scrapper.rb in relaton-ieee-1.9.4 vs lib/relaton_ieee/scrapper.rb in relaton-ieee-1.10.0
- old
+ new
@@ -4,44 +4,43 @@
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
# papam hit [Hash]
# @return [RelatonOgc::OrcBibliographicItem]
def parse_page(hit)
- doc = Nokogiri::HTML Faraday.get(hit["recordURL"]).body
+ doc = Nokogiri::HTML Faraday.get(hit[:url]).body
IeeeBibliographicItem.new(
fetched: Date.today.to_s,
- title: fetch_title(hit["recordTitle"]),
- docid: fetch_docid(hit["recordTitle"]),
- link: fetch_link(hit["recordURL"]),
+ title: fetch_title(doc),
+ docid: fetch_docid(hit[:ref]),
+ link: fetch_link(hit[:url]),
docstatus: fetch_status(doc),
abstract: fetch_abstract(doc),
contributor: fetch_contributor(doc),
language: ["en"],
script: ["Latn"],
date: fetch_date(doc),
- committee: fetch_committee(doc)
+ committee: fetch_committee(doc),
)
end
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
private
- # @param title [String]
+ # @param doc [String] Nokogiri::HTML4::Document
# @return [Array<RelatonBib::TypedTitleString>]
- def fetch_title(title)
- [
+ def fetch_title(doc)
+ doc.xpath("//h2[@id='stnd-title']").map do |t|
RelatonBib::TypedTitleString.new(
- type: "main", content: title, language: "en", script: "Latn"
- ),
- ]
+ type: "main", content: t.text, language: "en", script: "Latn",
+ )
+ end
end
- # @param title [String]
+ # @param ref [String]
# @return [Array<RelatonBib::DocumentIdentifier>]
- def fetch_docid(title)
- /^(?<identifier>(?:\w+\s)?\S+)/ =~ title
- [RelatonBib::DocumentIdentifier.new(id: identifier, type: "IEEE")]
+ def fetch_docid(ref)
+ [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEEE")]
end
# @param url [String]
# @return [Array>RelatonBib::TypedUri>]
def fetch_link(url)
@@ -49,14 +48,14 @@
end
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonBib::DocumentStatus, NilClass]
def fetch_status(doc)
- stage = doc.at("//td[.='Status']/following-sibling::td/div")
+ stage = doc.at("//dd[@id='stnd-status']")
return unless stage
- RelatonBib::DocumentStatus.new(stage: stage.text)
+ RelatonBib::DocumentStatus.new(stage: stage.text.split.first)
end
# @param identifier [String]
# @return [String]
# def fetch_edition(identifier)
@@ -65,37 +64,34 @@
# end
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::FormattedString>]
def fetch_abstract(doc)
- content = doc.at("//div[@class='description']")
- return [] unless content
-
- [RelatonBib::FormattedString.new(content: content.text, language: "en",
- script: "Latn")]
+ doc.xpath("//div[@id='stnd-description']").map do |a|
+ RelatonBib::FormattedString.new(
+ content: a.text.strip, language: "en", script: "Latn",
+ )
+ end
end
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::ContributionInfo>]
def fetch_contributor(doc)
- name = doc.at(
- "//td[.='IEEE Program Manager']/following-sibling::td/div/a"
- )
- return [] unless name
-
- [personn_contrib(name.text)]
+ doc.xpath("//dd[@id='stnd-staff-liaison']/text()").map do |name|
+ person_contrib(name.text.strip)
+ end
end
# @param name [String]
# @return [RelatonBib::ContributionInfo]
- def personn_contrib(name)
+ def person_contrib(name)
fname = RelatonBib::FullName.new(
- completename: RelatonBib::LocalizedString.new(name)
+ completename: RelatonBib::LocalizedString.new(name),
)
entity = RelatonBib::Person.new(name: fname)
RelatonBib::ContributionInfo.new(
- entity: entity, role: [type: "author"]
+ entity: entity, role: [type: "author"],
)
end
# @param name [String]
# @return [RelatonBib::ContributionInfo]
@@ -110,46 +106,43 @@
# @param date [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date(doc)
dates = []
- issued = doc.at "//td[.='Board Approval']/following-sibling::td/div"
- if issued
- dates << RelatonBib::BibliographicDate.new(type: "issued",
- on: issued.text)
+ id = doc.at "//dd[@id='stnd-approval-date']"
+ if id
+ dates << RelatonBib::BibliographicDate.new(type: "issued", on: id.text)
end
- published = doc.at("//td[.='History']/following-sibling::td/div")
- &.text&.match(/(?<=Published Date:)[\d-]+/)&.to_s
- if published
- dates << RelatonBib::BibliographicDate.new(type: "published",
- on: published)
+ pd = doc.at("//dd[@id='stnd-published-date']")
+ if pd
+ dates << RelatonBib::BibliographicDate.new(type: "published", on: pd.text)
end
dates
end
# rubocop:disable Metrics/AbcSize
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonIeee::Committee>]
def fetch_committee(doc)
committees = []
- sponsor = doc.at "//td[.='Sponsor Committee']/following-sibling::td/div"
+ sponsor = doc.at "//dd[@id='stnd-committee']/text()"
if sponsor
- committees << Committee.new(type: "sponsor", name: sponsor.text)
+ committees << Committee.new(type: "sponsor", name: sponsor.text.strip)
end
sponsor = doc.at "//td[.='Standards Committee']/following-sibling::td/div/a"
if sponsor
committees << Committee.new(type: "standard", name: sponsor.text)
end
- working = doc.at "//td[.='Working Group']/following-sibling::td/div"
- chair = doc.at "//td[.='Working Group Chair']/following-sibling::td/div"
+ working = doc.at "//dd[@id='stnd-working-group']/text()"
if working
- committees << Committee.new(type: "working", name: working.text,
+ chair = doc.at "//dd[@id='stnd-working-group-chair']"
+ committees << Committee.new(type: "working", name: working.text.strip,
chair: chair.text)
end
- society = doc.at "//td[.='Society']/following-sibling::td/div"
+ society = doc.at "//dd[@id='stnd-society']/text()"
if society
- committees << Committee.new(type: "society", name: society.text)
+ committees << Committee.new(type: "society", name: society.text.strip)
end
committees
end
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
end