require 'open-uri' require 'digest/sha2' require 'time' require 'fileutils' require 'pty' require 'expect' require 'pdf-reader' require 'nokogiri' require 'json' module ArxivUtil BASE_URL = "https://arxiv.org" REFERENCE_START_REGEXP = Regexp.new('References|REFERENCES|Reference|REFERENCE') REFERENCE_REGEXP = Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])') def self.makeId return Digest::SHA256.hexdigest Time.now.strftime("%F %H:%M:%S") end def self.makeDir(id, work_dir) Dir.mkdir("#{work_dir}/#{id}") end def self.removeDir(id, work_dir) FileUtils.rm_rf("#{work_dir}/#{id}") end def self.makeFile(id, work_dir, use_dir) if use_dir return "#{work_dir}/#{id}/output.pdf" else return "#{work_dir}/#{id}-output.pdf" end end def self. getK2Pdf(id, work_dir, use_dir) if use_dir return "#{work_dir}/#{id}/output_k2opt.pdf" else return "#{work_dir}/#{id}-output_k2opt.pdf" end end def removeFile(id, work_dir) File.delete("#{work_dir}/#{id}-output.pdf") File.delete("#{work_dir}/#{id}-output_k2opt.pdf") end def self.fetchFromUrl(urlName, work_dir, use_dir) puts "fetch => #{urlName}" charset = nil html = open(urlName) do |f| charset = f.charset f.read end page = Nokogiri::HTML.parse(html, nil, charset) result = {} result[:title] = page.xpath('//*[@id="abs"]/div[2]/h1').text result[:authors] = page.xpath('//*[@id="abs"]/div[2]/div[2]/a').text result[:abstruct] = page.xpath('//*[@id="abs"]/div[2]/blockquote').text result[:pdfurl] = "#{BASE_URL}#{page.xpath('//*[@id="abs"]/div[1]/div[1]/ul/li[1]/a').attr('href').value}" result[:references] = fetchFromPdfUrl(result[:pdfurl], work_dir, use_dir) return result.to_json end def self.fetchFromArxivId(id, work_dir, use_dir) target_url = "#{BASE_URL}/abs/#{id}" fetchFromUrl(target_url, work_dir, use_dir) end def self.fetchPdfFile(pdfUrl,file_name) open(file_name, 'wb') do |o| open(pdfUrl) do |data| o.write(data.read) end end end def self.convertSingleColPdf(job_id, work_dir,file_name, use_dir) cmd = "k2pdfopt -dev kpw #{file_name}" PTY.spawn(cmd) do |i,o| o.sync = true i.expect(/\S.*Enter option above \(h=help, q=quit\):/,10){ o.puts "\n" o.flush } while( i.eof? == false ) res = i.gets print res break unless res.index('written').nil? end end return getK2Pdf(job_id, work_dir, use_dir) end def self.fetchReference(file_name) reader = PDF::Reader.new(file_name) page_no = reader. pages. reject{|i| i.text.index(REFERENCE_START_REGEXP).nil? }. map(&:number). sort. shift puts "Detect References page=> #{page_no} " ref_page = reader. pages. select{|i| i.number >= page_no }. map{|i| i.text.gsub(/\n+/,"\n").gsub(/ +/,' ') }. join(' '). gsub(REFERENCE_REGEXP,"\n\\1"). gsub('- ',''). split("\n") return ref_page[(ref_page.index{|i| i =~ REFERENCE_START_REGEXP}+1)..ref_page.length]. select{|i| i.length > 5 } end def self.fetchFromPdfUrl(pdfUrl, work_dir, use_dir) job_id = makeId makeDir(job_id, work_dir) unless use_dir file_name = makeFile(job_id, work_dir, use_dir) fetchPdfFile(pdfUrl, file_name) executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir) references = fetchReference(executed_pdf) if use_dir removeDir(job_id, work_dir) else removeFile(job_id, work_dir) end return references end end