#!/usr/bin/perl
# -*- cperl -*-
#!/usr/bin/perl -CSD

=head1 NAME

citeExtract.pl

=head1 SYNOPSIS

RCS:$Id$

=head1 DESCRIPTION

Simple command script for executing ParsCit in an
offline mode (direct API call instead of going through
the web service).

=head1 HISTORY

ORIGIN: created from templateApp.pl version 3.4 by Min-Yen Kan

Min-Yen Kan, 15 Jul 2009.
Minh-Thang Luong, 25 May 2009.
Isaac Councill, 08/23/07

=cut

require 5.0;

use FindBin;
use Getopt::Std;
use POSIX qw(strftime);

# NOTE(review): only 'vars' strictness is enabled.  Full "use strict" is
# deliberately not switched on here because the triple dereference of the
# HeaderParse result further down may depend on non-strict refs -- confirm
# what HeaderParse::API::Parser::extractHeader returns before tightening.
use strict 'vars';
use lib $FindBin::Bin . "/../lib";

#use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
#use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";

# Dependencies
use File::Spec;
use File::Basename;

# Local libraries
use ParsCit::Controller;
use HeaderParse::API::Parser;
use HeaderParse::Config::API_Config;

# USER customizable section

# Scratch file name built from the script name, the PID and the current time.
my $tmpfile = $0;
$tmpfile =~ s/[\.\/]//g;
$tmpfile .= $$ . time;

# Untaint tmpfile variable
if ($tmpfile =~ /^([-\@\w.]+)$/) {
    $tmpfile = $1;
}

my $tmpdir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
$tmpfile = "$tmpdir/$tmpfile";

# Program name = last path component of $0 (used as a prefix in messages).
my ($progname) = $0 =~ /([^\/]+)$/;

# Mode bit flags; a mode is a bitwise OR of the stages to run.
my $PARSCIT   = 1;
my $PARSHED   = 2;
my $SECTLABEL = 4;    # Thang v100401
my $SVM       = 8;

my $default_input_type = "raw";
my $output_version     = "110505";
my $biblio_script      = $FindBin::Bin . "/BiblioScript/biblio_script.sh";
my $default_mode       = $PARSCIT;

# END user customizable section

# Ctrl-C handler: report the interrupt on STDERR and exit.
sub quitHandler {
    print STDERR "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
    exit;
}

# HELP sub-procedure: print the usage text on STDERR.
# NOTE(review): the usage strings look as if placeholder markup (e.g. the
# argument names after -m/-i/-e and the input file name) was stripped at some
# point; they are left exactly as found -- confirm against upstream.
sub Help {
    print STDERR "usage: $progname -h\t\t\t\t[invokes help]\n";
    print STDERR " $progname -v\t\t\t\t[invokes version]\n";
    print STDERR " $progname [-qt] [-m ] [-i ] [-e ] [outfile]\n";
    print STDERR "Options:\n";
    print STDERR "\t-q\tQuiet Mode (don't echo license)\n";

    # Thang v100401: add new mode (extract_section), and -i
    print STDERR "\t-m \tMode (extract_citations, extract_header, extract_section, extract_meta, extract_all, default: extract_citations)\n";
    print STDERR "\t-i \tType (raw, xml, default: raw)\n";
    print STDERR "\t-e \tExport citations into multiple types (ads|bib|end|isi|ris|wordbib). Multiple types could be specified by contatenating with \"-\" e.g., bib-end-ris. Output files will be named as outfile.exportFormat, with outfile being the input argument, and exportFormat being each individual format supplied by -e option.\n";
    print STDERR "\t-t\tUse token level model instead\n";
    print STDERR "\n";
    print STDERR "Will accept input on STDIN as a single file.\n";
}

# VERSION sub-procedure: show this script's POD through perldoc, then exit.
# Dies when perldoc cannot be run or reports failure.
sub Version {
    # List form: the script path is never interpreted by a shell.
    if (system('perldoc', $0)) {
        die "Need \"perldoc\" in PATH to print version information";
    }
    exit;
}

# Run an external command without going through a shell.
#   @_      : program followed by its arguments
#   returns : 1 when the command exited with status 0, otherwise 0 (after
#             printing a warning on STDERR).  Never dies, so callers that
#             treat the command as best-effort keep working.
sub RunCommand {
    my @command = @_;

    # Indirect-object form forces exec-style invocation even for one element.
    my $status = system { $command[0] } @command;
    return 1 if $status == 0;

    my $reason = ($status == -1)
        ? "could not be executed: $!"
        : "exited with status " . ($status >> 8);
    print STDERR "# $progname warning\t\tCommand \"@command\" $reason\n";
    return 0;
}

# MAIN program

my $cmd_line = $0 . " " . join(" ", @ARGV);

# Invoked with no arguments, error in execution
if ($#ARGV == -1) {
    print STDERR "# $progname info\t\tNo arguments detected, waiting for input on command line. \n";
    print STDERR "# $progname info\t\tIf you need help, stop this program and reinvoke with \"-h\". \n";
    exit(-1);
}

$SIG{'INT'} = 'quitHandler';

# Getopt::Std stores each switch in the package variable $opt_<letter>.
our ($opt_q, $opt_v, $opt_h, $opt_m, $opt_i, $opt_e, $opt_t, $opt_a);
getopts('hqm:i:e:tva');

# Use (!defined $opt_X) for options with arguments
if ($opt_v) {
    # call Version, if asked for
    Version();
    exit(0);
}

if ($opt_h) {
    # call help, if asked for
    Help();
    exit(0);
}

my $mode     = (!defined $opt_m) ? $default_mode : ParseMode($opt_m);
my $ph_model = (defined $opt_t) ? 1 : 0;

my $in  = shift;    # input file
my $out = shift;    # if available

# Output buffer
# NOTE(review): this header literal (and the empty footer appended before
# writing) looks like XML markup that was stripped -- $output_version is
# otherwise unused.  Left byte-identical; confirm against upstream.
my $rxml = "\n\n";

###
# Thang v100401: add input type option, and SectLabel
###
my $is_xml_input = 0;
if (defined $opt_i && $opt_i !~ /^(xml|raw)$/) {
    print STDERR "#! Input type needs to be either \"raw\" or \"xml\"\n";
    Help();

    # Invalid usage is an error: exit non-zero, like ParseMode does.
    exit(-1);
}
elsif (defined $opt_i && $opt_i eq "xml") {
    $is_xml_input = 1;
}

###
# Thang v100901: add export type option & incorporate BibUtils
###
my @export_types = ();
if (defined $opt_e && $opt_e ne "") {
    # Sanity checks
    # No call to extract_citation
    if (($mode & $PARSCIT) != $PARSCIT) {
        print STDERR "#! Export type option is only available for the following modes: extract_citations, extract_meta and extract_all\n";
        Help();
        exit(-1);
    }

    if (!defined $out) {
        print STDERR "#! Export type option requires output file name to be specified\n";
        Help();
        exit(-1);
    }

    # Get individual export types (the hash removes duplicates)
    my %type_hash = ();
    foreach my $token (split(/\-/, $opt_e)) {
        if ($token !~ /^(ads|bib|end|isi|ris|wordbib)$/) {
            print STDERR "#! Invalid export type \"$token\"\n";
            Help();
            exit(-1);
        }

        $type_hash{$token} = 1;
    }

    # Get all export types sorted
    @export_types = sort { $a cmp $b } keys %type_hash;
}

my $doc       = undef;
my $text_file = $in;

# SECTLABEL
if (($mode & $SECTLABEL) == $SECTLABEL) {
    my $sect_label_input = $text_file;
    my ($sl_xml, $aut_lines, $aff_lines) = SectLabel($sect_label_input, $is_xml_input, 0);

    # Remove first line
    $rxml .= RemoveTopLines($sl_xml, 1) . "\n";
}

# PARSHED
if (($mode & $PARSHED) == $PARSHED) {
    use ParsHed::Controller;

    my $ph_xml = ParsHed::Controller::extractHeader($text_file, $ph_model);

    # Remove first line
    $rxml .= RemoveTopLines($$ph_xml, 1) . "\n";
}

# PARSCIT
if (($mode & $PARSCIT) == $PARSCIT) {
    my $pc_xml = ParsCit::Controller::ExtractCitations($text_file, $in, $is_xml_input);

    # Remove first line
    $rxml .= RemoveTopLines($$pc_xml, 1) . "\n";

    # Thang v100901: call to BiblioScript
    if (scalar(@export_types) != 0) {
        BiblioScript(\@export_types, $$pc_xml, $out);
    }
}

# SVM HEADER PARSE
if (($mode & $SVM) == $SVM) {
    my $svm_xml = HeaderParse::API::Parser::extractHeader($text_file);
    $rxml .= $$$svm_xml . "\n";
}

$rxml .= "";

if (defined $out) {
    open(my $out_fh, ">:utf8", $out)
        or die $progname . " fatal\tCould not open \"" . $out . "\" for writing: $!";
    print {$out_fh} $rxml;

    # Buffered write errors only surface at close time.
    close($out_fh)
        or die $progname . " fatal\tCould not finish writing \"" . $out . "\": $!";
}
else {
    print $rxml;
}

# Clean-up step
# Only applies when the working text file is a separate temporary copy of the
# input.  In this script $text_file is the input itself, so without the
# inequality guard the user's input file would be deleted after every
# "-i xml" run.
if ($is_xml_input && defined $in && $text_file ne $in) {
    # PARSCIT
    if (($mode & $PARSCIT) == $PARSCIT) {
        # Get the normal .body .cite files
        RunCommand('mv', '--', "$text_file.body", "$in.body");
        RunCommand('mv', '--', "$text_file.cite", "$in.cite");
    }

    unlink($text_file);
}

# END of main program

# Translate the -m argument into a bit mask of processing stages.
#   $arg    : mode name given on the command line
#   returns : bitwise OR of $PARSCIT / $PARSHED / $SECTLABEL / $SVM
# Prints the usage text and exits with status -1 for an unknown mode.
sub ParseMode {
    my $arg = shift;

    my %mask_for = (
        extract_meta      => ($PARSCIT | $PARSHED),
        extract_header    => ($PARSHED | $SVM),
        extract_citations => $PARSCIT,
        extract_section   => $SECTLABEL,
        extract_all       => ($PARSHED | $PARSCIT | $SECTLABEL | $SVM),
    );

    return $mask_for{$arg} if exists $mask_for{$arg};

    Help();
    exit(-1);
}

# Remove top n lines
#   $input  : multi-line string
#   $top_n  : number of leading lines to drop
#   returns : remaining lines joined with "\n" (no trailing newline)
sub RemoveTopLines {
    my ($input, $top_n) = @_;

    my @lines = split(/\n/, $input);
    for (my $i = 0; $i < $top_n; $i++) {
        shift(@lines);
    }

    return join("\n", @lines);
}

###
# Thang v100401: generate section info
###
# Run SectLabel::Controller::ExtractSection on a file, resolving model,
# dictionary, function and config paths relative to the script's parent
# directory.
#   $text_file    : file to label
#   $is_xml_input : true selects the XML model/config from SectLabel::Config
#   $for_parscit  : false -> returns (dereferenced XML, $aut_lines, $aff_lines)
#                   true  -> returns ($all_text, $cit_lines) as handed back
sub SectLabel {
    my ($text_file, $is_xml_input, $for_parscit) = @_;

    use SectLabel::Config;
    use SectLabel::Controller;

    my $is_xml_output = 1;
    my $is_debug      = 0;

    my $model_file = $is_xml_input ? $SectLabel::Config::modelXmlFile : $SectLabel::Config::modelFile;
    $model_file = "$FindBin::Bin/../$model_file";

    my $dict_file = $SectLabel::Config::dictFile;
    $dict_file = "$FindBin::Bin/../$dict_file";

    my $func_file = $SectLabel::Config::funcFile;
    $func_file = "$FindBin::Bin/../$func_file";

    my $config_file = $is_xml_input ? $SectLabel::Config::configXmlFile : $SectLabel::Config::configFile;
    $config_file = "$FindBin::Bin/../$config_file";

    # Both call variants take exactly the same argument list.
    my @extract_args = (
        $text_file,   $is_xml_output, $model_file,
        $dict_file,   $func_file,     $config_file,
        $is_xml_input, $is_debug,     $for_parscit,
    );

    # Classify section
    if (!$for_parscit) {
        my ($sl_xml, $aut_lines, $aff_lines) = SectLabel::Controller::ExtractSection(@extract_args);

        return ($$sl_xml, $aut_lines, $aff_lines);
    }

    # Huydhn: sectlabel output -> parscit input
    else {
        my ($all_text, $cit_lines) = SectLabel::Controller::ExtractSection(@extract_args);

        return ($all_text, $cit_lines);
    }
}

###
# Thang v100901: incorporate BiblioScript
###
# Export the citation XML into each requested bibliography format by calling
# the external biblio script, writing the results to "<outfile>.<format>".
#   $types   : array ref of export format names
#   $pc_xml  : citation XML text to convert
#   $outfile : base name for the exported files
# Failures are reported on STDERR and the export is abandoned; the sub never
# dies, so the main extraction output can still be written by the caller.
sub BiblioScript {
    my ($types, $pc_xml, $outfile) = @_;

    my @export_types = @{$types};
    return if !@export_types;

    my $base_tmp_dir = $ENV{'PARSCIT_TMPDIR'} || "/tmp";
    my $tmp_dir      = $base_tmp_dir . "/" . NewTmpFile();
    RunCommand('mkdir', '-p', '--', $tmp_dir) or return;

    # Write extract_citation output to a tmp file
    my $filename = "$tmp_dir/input.txt";
    my $input_fh;
    if (!open($input_fh, ">:utf8", $filename)) {
        print STDERR "# $progname warning\t\tCould not open \"$filename\" for writing: $!\n";
        RunCommand('rm', '-rf', '--', $tmp_dir);
        return;
    }
    print {$input_fh} $pc_xml;
    if (!close($input_fh)) {
        print STDERR "# $progname warning\t\tCould not finish writing \"$filename\": $!\n";
        RunCommand('rm', '-rf', '--', $tmp_dir);
        return;
    }

    # Call to BiblioScript.  The first format is converted from the ParsCit
    # XML; later formats reuse the MODS file generated in the first call.
    for my $i (0 .. $#export_types) {
        my $format = $export_types[$i];
        my ($input_type, $input_path) =
            ($i == 0)
            ? ('parscit', $filename)
            : ('mods', $tmp_dir . "/parscit_mods.xml");

        RunCommand($biblio_script, '-q', '-i', $input_type, '-o', $format, $input_path, $tmp_dir);
        RunCommand('mv', '--', "$tmp_dir/parscit.$format", "$outfile.$format");
    }

    RunCommand('rm', '-rf', '--', $tmp_dir);
    return;
}

# Method to generate tmp file name
#   returns : "YYYYmmdd-HHMMSS-<pid>" built from local time and the process id
sub NewTmpFile {
    return strftime('%Y%m%d-%H%M%S', localtime) . "-$$";
}