parscit/lib/ParsCit/ in biblicit-2.0.3 vs parscit/lib/ParsCit/ in biblicit-2.0.4
- old
+ new
@@ -19,12 +19,10 @@
use ParsCit::Config;
use ParsCit::Tr2crfpp;
use ParsCit::PreProcess;
use ParsCit::PostProcess;
use ParsCit::CitationContext;
-# Omnipage libraries
-use Omni::Omnidoc;
# Dependencies
use CSXUtil::SafeText qw(cleanXML);
# Main API method for generating an XML document including
@@ -226,66 +224,10 @@
# Reference text, body text, and normalized body text
my ($rcite_text, $rnorm_body_text, $rbody_text) = undef;
# Reference to an array of single reference
my $rraw_citations = undef;
- # Find and separate reference
- if ($is_xml)
- {
- ###
- # Huydhn: input is xml from Omnipage
- ###
- if (! open(IN, "<:utf8", $orgfile)) { return (-1, "Could not open xml file " . $orgfile . ": " . $!); }
- my $xml = do { local $/; <IN> };
- close IN;
- ###
- # Huydhn
- # NOTE: the omnipage xml is not well constructed (concatenated multiple xml files).
- # This merged xml need to be fixed first before pass it to xml processing libraries, e.g. xml::twig
- ###
- # Convert to Unix format
- $xml =~ s/\r//g;
- # Remove <?xml version="1.0" encoding="UTF-8"?>
- $xml =~ s/<\?xml.+?>\n//g;
- # Remove <!--XML document generated using OCR technology from ScanSoft, Inc.-->
- $xml =~ s/<\!\-\-XML.+?>\n//g;
- # Declaration and root
- $xml = "<?xml version=\"1.0\"?>" . "\n" . "<root>" . "\n" . $xml . "\n" . "</root>";
- # New document
- my $doc = new Omni::Omnidoc();
- $doc->set_raw($xml);
- # Extract the reference portion from the XML
- my ($start_ref, $end_ref, $rcite_text_from_xml, $rcit_addrs) = ParsCit::PreProcess::FindCitationTextXML($doc);
- # Extract the reference portion from the text.
- my $content = $doc->get_content();
- ($rcite_text, $rnorm_body_text, $rbody_text) = ParsCit::PreProcess::FindCitationText(\$content, \@pos_array);
- my @norm_body_tokens = split(/\s+/, $$rnorm_body_text);
- my @body_tokens = split(/\s+/, $$rbody_text);
- my $size = scalar(@norm_body_tokens);
- my $size1 = scalar(@pos_array);
- if($size != $size1) { die "ParsCit::Controller::extractCitationsImpl: normBodyText size $size != posArray size $size1\n"; }
- # Filename initialization
- if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text_from_xml, $rbody_text); }
- # Prepare to split unmarked reference portion
- my $tmp_file = ParsCit::Tr2crfpp::PrepDataUnmarked($doc, $rcit_addrs);
- # Extract citations from citation text
- $rraw_citations = ParsCit::PreProcess::SegmentCitationsXML($rcite_text_from_xml, $tmp_file);
- }
- else
- {
if (! open(IN, "<:utf8", $textfile)) { return (-1, "Could not open text file " . $textfile . ": " . $!); }
my $text = do { local $/; <IN> };
close IN;
@@ -307,10 +249,9 @@
# Filename initialization
if ($bwrite_split > 0) { ($citefile, $bodyfile) = WriteSplit($textfile, $rcite_text, $rbody_text); }
# Extract citations from citation text
$rraw_citations = ParsCit::PreProcess::SegmentCitations($rcite_text);
- }
my @citations = ();
my @valid_citations = ();
# Process each citation