lib/bolognese/readers/schema_org_reader.rb in bolognese-0.9.91 vs lib/bolognese/readers/schema_org_reader.rb in bolognese-0.9.92
- old
+ new
@@ -14,11 +14,14 @@
return { "string" => nil, "state" => "not_found" } unless id.present?
id = normalize_id(id)
response = Maremma.get(id)
doc = Nokogiri::XML(response.body.fetch("data", nil), nil, 'UTF-8')
- string = doc.at_xpath('//script[@type="application/ld+json"]')
+ #string = doc.at_xpath('//script[@type="application/ld+json"]')
+ # workaround for xhtml documents
+ nodeset = doc.css("script")
+ string = nodeset.find { |element| element["type"] == "application/ld+json" }
string = string.text if string.present?
{ "string" => string }
end
@@ -29,19 +32,30 @@
end
meta = string.present? ? Maremma.from_json(string) : {}
id = normalize_id(meta.fetch("@id", nil) || options[:id])
- type = meta.fetch("@type", nil)
+ type = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
resource_type_general = Bolognese::Utils::SO_TO_DC_TRANSLATIONS[type]
- author = get_authors(from_schema_org(Array.wrap(meta.fetch("author", nil))))
+ authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
+ author = get_authors(from_schema_org(Array.wrap(authors)))
editor = get_authors(from_schema_org(Array.wrap(meta.fetch("editor", nil))))
publisher = if meta.dig("publisher").is_a?(Hash)
meta.dig("publisher", "name")
elsif publisher.is_a?(String)
meta.dig("publisher")
end
+
+ included_in_data_catalog = from_schema_org(Array.wrap(meta.fetch("includedInDataCatalog", nil)))
+ included_in_data_catalog = Array.wrap(included_in_data_catalog).map { |dc| { "title" => dc["name"], "url" => dc["url"] } }
+ is_part_of = schema_org_is_part_of(meta) || included_in_data_catalog
+
+ license = {
+ "id" => parse_attributes(meta.fetch("license", nil), content: "id", first: true),
+ "name" => parse_attributes(meta.fetch("license", nil), content: "name", first: true)
+ }
+
date_published = meta.fetch("datePublished", nil)
state = meta.present? ? "findable" : "not_found"
{ "id" => id,
"type" => type,
@@ -56,20 +70,20 @@
"alternate_name" => meta.fetch("alternateName", nil),
"author" => author,
"publisher" => meta.dig("publisher", "name"),
"service_provider" => meta.fetch("provider", nil),
"is_identical_to" => schema_org_is_identical_to(meta),
- "is_part_of" => schema_org_is_part_of(meta),
+ "is_part_of" => is_part_of,
"has_part" => schema_org_has_part(meta),
"references" => schema_org_references(meta),
"is_referenced_by" => schema_org_is_referenced_by(meta),
"is_supplement_to" => schema_org_is_supplement_to(meta),
"is_supplemented_by" => schema_org_is_supplemented_by(meta),
"date_created" => meta.fetch("dateCreated", nil),
"date_published" => date_published,
"date_modified" => meta.fetch("dateModified", nil),
"description" => meta.fetch("description", nil).present? ? { "text" => sanitize(meta.fetch("description")) } : nil,
- "license" => { "id" => meta.fetch("license", nil) },
+ "license" => license,
"b_version" => meta.fetch("version", nil),
"keywords" => meta.fetch("keywords", nil).to_s.split(", "),
"state" => state
}
end