lib/arxiv/references/myUtil.rb in arxiv-references-0.1.6.5 vs lib/arxiv/references/myUtil.rb in arxiv-references-0.1.7.0
- old
+ new
@@ -5,10 +5,11 @@
require 'pty'
require 'expect'
require 'pdf-reader'
require 'nokogiri'
require 'json'
+
module ArxivUtil
BASE_URL = "https://arxiv.org"
REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE')
REFERENCE_REGEXP = Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')
def self.makeId
@@ -43,30 +44,30 @@
File.delete("#{work_dir}/#{id}-output.pdf")
File.delete("#{work_dir}/#{id}-output_k2opt.pdf")
end
- def self.fetchFromUrl(urlName, work_dir, use_dir)
+ def self.fetchFromUrl(urlName, work_dir, use_dir, use_pdf)
puts "fetch => #{urlName}"
charset = nil
html = open(urlName) do |f|
charset = f.charset
f.read
end
page = Nokogiri::HTML.parse(html, nil, charset)
result = {}
- result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text
- result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text
- result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text
+ result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').children.select{|i| i.name=='text'}.shift.text.gsub(/\n/,'')
+ result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').map(&:text)
+ result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').children.select{|i| i.name = 'text'}.reverse.shift.text
result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}"
- result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir)
- return result.to_json
+ result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) if use_pdf
+ return result
end
# Builds the abstract-page URL "#{BASE_URL}/abs/#{id}" for an arXiv id and
# delegates to fetchFromUrl, returning whatever fetchFromUrl returns.
# Lines marked "-" belong to 0.1.6.5 (old), lines marked "+" to 0.1.7.0 (new).
# The new revision adds the use_pdf parameter and passes it through unchanged.
- def self.fetchFromArxivId(id, work_dir, use_dir)
+ def self.fetchFromArxivId(id, work_dir, use_dir, use_pdf)
target_url = "#{BASE_URL}/abs/#{id}"
- fetchFromUrl(target_url, work_dir, use_dir)
+ fetchFromUrl(target_url, work_dir, use_dir, use_pdf)
end
def self.fetchPdfFile(pdfUrl,file_name)
open(file_name, 'wb') do |o|
open(pdfUrl) do |data|