lib/arxiv.rb in arxiv-0.0.4 vs lib/arxiv.rb in arxiv-0.0.5
- old
+ new
@@ -15,35 +15,65 @@
# Exception classes raised by the manuscript lookup in this file.
module Error
  # Raised when the API response yields a manuscript without a title.
  class ManuscriptNotFound < StandardError
  end

  # Raised when an identifier matches neither of the arXiv ID formats.
  class MalformedId < StandardError
  end
end
- ID_FORMAT = /^\d{4}\.\d{4}(?:v\d+)?$/
+ # In 2007, the ArXiv API changed document ID formats:
+ #
+ # http://arxiv.org/abs/math/0510097v1 (legacy)
+ # http://arxiv.org/abs/1202.0819v1 (current)
+ #
+ # These constants help us deal with both use cases.
+ #
+ LEGACY_URL_FORMAT = /[^\/]+\/\d+(?:v\d+)?$/
+ CURRENT_URL_FORMAT = /\d{4}\.\d{4}(?:v\d+)?$/
- def self.get(id)
+ LEGACY_ID_FORMAT = /^#{LEGACY_URL_FORMAT}/
+ ID_FORMAT = /^#{CURRENT_URL_FORMAT}/
- id = parse_arxiv_id(id)
+ # Looks up a single manuscript through the arXiv export API.
+ #
+ # identifier - a bare arXiv ID, or a string ending in one (such as an
+ #              abs URL); it is normalised by parse_arxiv_identifier first.
+ #
+ # Returns the object produced by Arxiv::Manuscript.parse for the response.
+ #
+ # Raises Arxiv::Error::MalformedId when the normalised ID matches neither
+ # ID_FORMAT nor LEGACY_ID_FORMAT.
+ # Raises Arxiv::Error::ManuscriptNotFound when the parsed manuscript has a
+ # nil title.
+ #
+ # NOTE(review): `open` is called with a URI object below; presumably
+ # open-uri is required elsewhere in the gem - confirm.
+ def self.get(identifier)
- raise Arxiv::Error::MalformedId, "Manuscript ID format is invalid" unless id =~ ID_FORMAT
+ id = parse_arxiv_identifier(identifier)
+ unless id =~ ID_FORMAT || id =~ LEGACY_ID_FORMAT
+ raise Arxiv::Error::MalformedId, "Manuscript ID format is invalid"
+ end
+
url = ::URI.parse("http://export.arxiv.org/api/query?id_list=#{id}")
response = ::Nokogiri::XML(open(url)).remove_namespaces!
manuscript = Arxiv::Manuscript.parse(response.to_s, single: id)
raise Arxiv::Error::ManuscriptNotFound, "Manuscript #{id} doesn't exist on arXiv" if manuscript.title.nil?
manuscript
end
private
- def self.parse_arxiv_id(id)
- if id =~ ID_FORMAT
- id
- elsif id =~ /arxiv.org/
- match = id.match(/[^\/]+$/)
- match[0] if match
+ # Normalises caller input to a bare arXiv ID.
+ #
+ # Returns identifier unchanged when it already is a valid ID. When it only
+ # ends in an ID (see valid_url?), returns just that trailing ID portion.
+ # Otherwise returns identifier untouched, leaving rejection to the caller.
+ #
+ # NOTE(review): the bare `private` above does not apply to methods defined
+ # with `def self.`, so this method stays publicly callable - confirm whether
+ # private_class_method was intended.
+ def self.parse_arxiv_identifier(identifier)
+ if valid_id?(identifier)
+ identifier
+ elsif valid_url?(identifier)
+ # Choose the scheme that matched, then capture only the ID portion.
+ format = legacy_url?(identifier) ? LEGACY_URL_FORMAT : CURRENT_URL_FORMAT
+ identifier.match(/(#{format})/)[1]
else
- id
+ identifier # probably an error
end
end
+
+ def self.valid_id?(identifier)
+ identifier =~ ID_FORMAT || identifier =~ LEGACY_ID_FORMAT
+ end
+
+ def self.valid_url?(identifier)
+ identifier =~ LEGACY_URL_FORMAT || identifier =~ CURRENT_URL_FORMAT
+ end
+
+ # Truthy when +identifier+ ends in a legacy-scheme ID (LEGACY_URL_FORMAT).
+ # The value is whatever =~ yields: the match offset, or nil when there is
+ # no match.
+ def self.legacy_url?(identifier)
+ identifier =~ LEGACY_URL_FORMAT
+ end
+
+
+
+
+
end