Sha256: 98301ddbd24c9063f0a7462fc8be43a089b946dd5f27a53b392df9c041bca401

Contents?: true

Size: 1.32 KB

Versions: 1

Compression:

Stored size: 1.32 KB

Contents

# encoding: UTF-8

require 'tmpdir'
require 'shellwords'
require 'nokogiri'

# Thin Ruby wrapper around the ParsCit Perl tool (citeExtract.pl): runs the
# script on an input file and converts its XML output into plain Ruby hashes.
module ParsCit

  # Location of the ParsCit distribution, resolved relative to this file.
  PERL_DIR = "#{File.dirname(__FILE__)}/../../parscit"

  # Runs ParsCit on +in_file+ and returns the parsed result.
  #
  # in_file - an object responding to #path (e.g. a File or Tempfile).
  # opts    - accepted for interface compatibility; currently unused.
  #
  # Returns a Hash with the keys :title, :authors, :abstract and :citations.
  def self.extract(in_file, opts={})
    ParseOperation.new(in_file, opts).result
  end

  # Performs a single citeExtract.pl run and exposes the parsed output
  # through #result.
  class ParseOperation

    # Child elements of a <citation> that are never copied into the citation
    # hash (authors are collected separately from the <author> nodes).
    SKIPPED_CITATION_FIELDS = %w[contexts authors marker rawString].freeze

    attr_reader :result

    # in_txt - an object responding to #path that names the file to analyse.
    # opts   - accepted for interface compatibility; currently unused.
    def initialize(in_txt, opts={})
      # Unless the caller already set CRFPP_HOME, derive it from wherever
      # `which` finds the crf_test binary (the parent of its directory).
      ENV['CRFPP_HOME'] ||= "#{File.dirname(`which crf_test`)}/../"
      output = `#{extract_command(in_txt.path)}`
      @result = parse(Nokogiri::XML output)
    end

  private

    # Builds the shell command line for citeExtract.pl. Every argument is
    # shell-escaped so paths containing spaces or shell metacharacters are
    # passed through as a single literal argument.
    def extract_command(path)
      Shellwords.join(["#{PERL_DIR}/bin/citeExtract.pl", '-q', '-m', 'extract_all', path])
    end

    # Converts the XML document produced by citeExtract.pl into a Hash:
    #   :title     - whitespace-normalised title from the ParsHed section
    #   :authors   - whitespace-normalised author names from the ParsHed section
    #   :abstract  - abstract text from the ParsHed section, unmodified
    #   :citations - one Hash per <citation> of the ParsCit section
    def parse(xml)
      header = xml.css("algorithm[name=ParsHed]")

      {
        title: squish(header.css('title').text),
        authors: header.css('author').map { |a| squish(a.text) },
        abstract: header.css('abstract').text,
        citations: xml.css('algorithm[name=ParsCit] > citationList > citation').map { |node| parse_citation(node) }
      }
    end

    # Builds the Hash for one <citation> node: the unique, non-blank author
    # names under :authors, plus one entry per remaining non-blank child
    # element, keyed by the element name.
    def parse_citation(node)
      names = node.css('author').map { |a| a.text.strip }
      cite = { authors: names.reject { |name| blank_text?(name) }.uniq }

      node.children.each do |child|
        next if SKIPPED_CITATION_FIELDS.include?(child.name)

        text = child.text.strip
        cite[child.name.to_sym] = text unless blank_text?(text)
      end

      cite
    end

    # Collapses every run of whitespace to a single space and trims the ends.
    def squish(text)
      text.gsub(/\s+/, ' ').strip
    end

    # True when +text+ is empty or consists solely of whitespace. Implemented
    # locally so this file does not rely on ActiveSupport's String#blank?.
    def blank_text?(text)
      text !~ /[^[:space:]]/
    end

  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
biblicit-2.1.0 lib/biblicit/parscit.rb