Sha256: 24b6a9f4893808238d7d0ce6c7c3a802d1d411ab3d3eba4590a2ef4ca00a9a12

Contents?: true

Size: 1.96 KB

Versions: 1

Compression:

Stored size: 1.96 KB

Contents

# encoding: UTF-8

require 'tmpdir'
require 'shellwords'
require 'nokogiri'

module ParsCit

  # Path of the ParsCit directory, built relative to the directory that
  # contains this source file (two levels up, then "parscit").
  PERL_DIR = "#{File.dirname(__FILE__)}/../../parscit"

  # Convenience entry point: builds a ParseOperation for the given input and
  # hands back what that operation produced.
  #
  # @param in_file passed through untouched as the first argument of
  #   ParseOperation.new
  # @param opts [Hash] passed through untouched as the second argument of
  #   ParseOperation.new
  # @return the operation's +result+
  def self.extract(in_file, opts={})
    operation = ParseOperation.new(in_file, opts)
    operation.result
  end

  # Runs ParsCit's citeExtract.pl over a file and converts the XML the script
  # prints on standard output into a plain Ruby hash, available via #result.
  #
  # The result hash has three keys:
  #   :parshed   - title / authors / abstract from the "ParsHed" algorithm
  #   :citeseer  - title / authors / author_emails / abstract / valid from the
  #                "SVM HeaderParse" algorithm
  #   :citations - one hash per <citation> found under the "ParsCit" algorithm
  #
  # NOTE(review): `blank?` and `try` are not core Ruby methods; this class
  # presumably relies on ActiveSupport being loaded elsewhere -- confirm.
  class ParseOperation

    # Child element names of a <citation> that are never copied verbatim into
    # the citation hash.
    SKIPPED_CITATION_FIELDS = %w[contexts authors marker rawString].freeze

    attr_reader :result

    # Runs the extraction immediately and stores the parsed hash in @result.
    #
    # Side effect: sets ENV['CRFPP_HOME'] and ENV['PARSCIT_TMPDIR'] when they
    # are not already set.
    #
    # @param in_txt [#path] object whose #path names the file to process
    # @param opts [Hash] when :include_citations is truthy the script runs in
    #   'extract_all' mode, otherwise in 'extract_header' mode (the default)
    def initialize(in_txt, opts={})
      mode = opts.fetch(:include_citations, false) ? 'extract_all' : 'extract_header'

      # Defaults only: values already present in the environment win.
      ENV['CRFPP_HOME'] ||= "#{File.dirname(`which crf_test`)}/../"
      ENV['PARSCIT_TMPDIR'] ||= "/tmp/"

      # Escape every word so that paths containing spaces or shell
      # metacharacters reach the script as single, literal arguments.
      command = Shellwords.join(["#{PERL_DIR}/bin/citeExtract.pl", '-q', '-m', mode, in_txt.path])
      output = `#{command}`

      @result = parse(Nokogiri::XML(output))
    end

    private

    # Builds the complete result hash from the parsed XML document.
    def parse(xml)
      {
        parshed: parse_parshed(xml.css("algorithm[name=ParsHed]")),
        citeseer: parse_citeseer(xml.css('algorithm[name="SVM HeaderParse"]')),
        citations: parse_citations(xml.css('algorithm[name=ParsCit] > citationList > citation'))
      }
    end

    # Header fields from the ParsHed section; title and author names have
    # their whitespace collapsed, the abstract is taken as-is.
    def parse_parshed(parshed)
      {
        title: collapse_whitespace(parshed.css('title').text),
        authors: parshed.css('author').map { |a| collapse_whitespace(a.text) },
        abstract: parshed.css('abstract').text
      }
    end

    # Header fields from the SVM HeaderParse section. :valid is true only when
    # the first <validHeader> element exists and its text is exactly '1'.
    def parse_citeseer(svm)
      {
        title: svm.css('title').text,
        authors: unique_texts(svm.css('author > name')),
        author_emails: unique_texts(svm.css('author > email')),
        abstract: svm.css('abstract').text,
        valid: svm.css('validHeader').first.try(:text) == '1'
      }
    end

    # One hash per citation node: an :authors list plus one entry for every
    # other child node that is not skipped and has non-blank text.
    def parse_citations(nodes)
      nodes.map do |node|
        cite = { authors: unique_texts(node.css('author')) }

        node.children.each do |child|
          next if SKIPPED_CITATION_FIELDS.include?(child.name)

          text = child.text.strip
          next if text.blank?

          cite[child.name.to_sym] = text
        end

        cite
      end
    end

    # Stripped text of each node, with blank entries and duplicates removed.
    def unique_texts(nodes)
      nodes.map { |n| n.text.strip }.reject(&:blank?).uniq
    end

    # Replaces every run of whitespace with one space and trims both ends.
    def collapse_whitespace(text)
      text.gsub(/\s+/, ' ').strip
    end

  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
biblicit-2.2.3 lib/biblicit/parscit.rb