package ParsCit::PreProcess;

###
# Utilities for finding and normalizing citations within
# text files, including separating citation text from
# body text and segmenting citations.
#
# Isaac Councill, 7/19/07
###

use utf8;
use strict;

use ParsCit::Citation;

# Regular expression source (as strings) for each supported citation marker style.
my %marker_types = (
    'SQUARE'      => '\\[.+?\\]',
    'PAREN'       => '\\(.+?\\)',
    'NAKEDNUM'    => '\\d+',
    'NAKEDNUMDOT' => '\\d+\\.',
    #'NAKEDNUM'    => '\\d{1,3}',      # Modified by Artemy Kolchinsky (v090625)
    #'NAKEDNUMDOT' => '\\d{1,3}\\.'    # Modified by Artemy Kolchinsky (v090625)
);

# Heading that opens the reference section. Shared by FindCitationTextXML and
# FindCitationText so that the two finders cannot drift apart again (see the
# correction history kept in FindCitationText).
my $REFERENCE_HEADING = qr/\b(?:References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?|LITERATURE?\s+CITED?)/;

###
# Huydhn: similar to findCitationText, find the citation portion using regular expression.
# However the input is an omnipage xml document object, not the raw text
#
# Positions are hashes with the keys L1..L4 = (page, column, paragraph, line)
# indices into the nested lists returned by get_objs_ref().
#
# Returns (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs) on success.
# When no reference section is found, or when it is judged too long compared
# to the document, only three elements are returned: references to two empty
# hashes and to an empty string.
###
sub FindCitationTextXML
{
    my ($doc) = @_;

    # Positions or addresses of all lines in the reference
    my @cit_addrs = ();

    # Start and end of a reference
    my %start_ref = ();
    my %end_ref   = ();

    # All pages in the document
    my $pages = $doc->get_objs_ref();

    # Scan the document backwards, line by line, so that the LAST heading wins
    PAGE:
    for (my $x = $#{ $pages }; $x >= 0; $x--)
    {
        # All columns in one page
        my $columns = $pages->[ $x ]->get_objs_ref();

        for (my $y = $#{ $columns }; $y >= 0; $y--)
        {
            # All paragraphs in one column
            my $paras = $columns->[ $y ]->get_objs_ref();

            for (my $z = $#{ $paras }; $z >= 0; $z--)
            {
                # All lines in one paragraph
                my $lines = $paras->[ $z ]->get_objs_ref();

                for (my $t = $#{ $lines }; $t >= 0; $t--)
                {
                    my $ln_content = $lines->[ $t ]->get_content();

                    # Is it the beginning of a reference
                    next unless ($ln_content =~ m/${REFERENCE_HEADING}:?\s*$/);

                    # The reference text starts on the line that follows the heading
                    if (($t + 1) < scalar(@{ $lines }))
                    {
                        %start_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t + 1);
                    }
                    elsif (($z + 1) < scalar(@{ $paras }))
                    {
                        %start_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z + 1, 'L4' => 0);
                    }
                    elsif (($y + 1) < scalar(@{ $columns }))
                    {
                        %start_ref = ('L1' => $x, 'L2' => $y + 1, 'L3' => 0, 'L4' => 0);
                    }
                    elsif (($x + 1) < scalar(@{ $pages }))
                    {
                        %start_ref = ('L1' => $x + 1, 'L2' => 0, 'L3' => 0, 'L4' => 0);
                    }
                    # Otherwise: what the heck, the beginning is at the end of
                    # the document. %start_ref stays empty.

                    last PAGE;
                }
            }
        }
    }

    # Reference length
    my $reference_length = 0;
    # Citation
    my $reference_text = "";

    # Reference not found
    if (! exists $start_ref{ 'L1' }) { return (\%start_ref, \%end_ref, \$reference_text); }

    # Scan forward from the start of the reference until a line that opens
    # another section is met
    END_PAGE:
    for (my $x = $start_ref{ 'L1' }; $x < scalar(@{ $pages }); $x++)
    {
        # All columns in one page
        my $columns      = $pages->[ $x ]->get_objs_ref();
        my $start_column = ($x == $start_ref{ 'L1' }) ? $start_ref{ 'L2' } : 0;

        for (my $y = $start_column; $y < scalar(@{ $columns }); $y++)
        {
            # All paragraphs in one column
            my $paras      = $columns->[ $y ]->get_objs_ref();
            my $start_para = (($x == $start_ref{ 'L1' }) && ($y == $start_ref{ 'L2' })) ? $start_ref{ 'L3' } : 0;

            for (my $z = $start_para; $z < scalar(@{ $paras }); $z++)
            {
                # All lines in one paragraph
                my $lines      = $paras->[ $z ]->get_objs_ref();
                my $start_line = (($x == $start_ref{ 'L1' }) && ($y == $start_ref{ 'L2' }) && ($z == $start_ref{ 'L3' })) ? $start_ref{ 'L4' } : 0;

                for (my $t = $start_line; $t < scalar(@{ $lines }); $t++)
                {
                    my $ln_content = $lines->[ $t ]->get_content();

                    # Is it the end?
                    if ($ln_content =~ m/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)$/)
                    {
                        # Then save the location of the line just before it
                        if ($t > 0)
                        {
                            %end_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t - 1);
                        }
                        elsif ($z > 0)
                        {
                            # Last line of the previous paragraph
                            my $prev_lines = $paras->[ $z - 1 ]->get_objs_ref();
                            %end_ref = ('L1' => $x, 'L2' => $y, 'L3' => $z - 1, 'L4' => $#{ $prev_lines });
                        }
                        elsif ($y > 0)
                        {
                            # Last line of the last paragraph of the previous column
                            my $prev_paras = $columns->[ $y - 1 ]->get_objs_ref();
                            my $prev_lines = $prev_paras->[ -1 ]->get_objs_ref();
                            %end_ref = ('L1' => $x, 'L2' => $y - 1, 'L3' => $#{ $prev_paras }, 'L4' => $#{ $prev_lines });
                        }
                        elsif ($x > 0)
                        {
                            # Last line of the previous page
                            my $prev_columns = $pages->[ $x - 1 ]->get_objs_ref();
                            my $prev_paras   = $prev_columns->[ -1 ]->get_objs_ref();
                            my $prev_lines   = $prev_paras->[ -1 ]->get_objs_ref();
                            %end_ref = ('L1' => $x - 1, 'L2' => $#{ $prev_columns }, 'L3' => $#{ $prev_paras }, 'L4' => $#{ $prev_lines });
                        }
                        # Otherwise: what the heck, the end is at the beginning
                        # of the document. %end_ref stays empty and the fallback
                        # below applies.

                        last END_PAGE;
                    }

                    # This is is not the end of the reference, so, logically, it belongs to the reference
                    push @cit_addrs, { 'L1' => $x, 'L2' => $y, 'L3' => $z, 'L4' => $t };

                    $reference_length += length($ln_content);
                    $reference_text   .= $ln_content . "\n";
                }
            }
        }
    }

    # End of the reference not found, assume that it's the end of the document
    if (! exists $end_ref{ 'L1' })
    {
        my $last_columns = $pages->[ -1 ]->get_objs_ref();
        my $last_paras   = $last_columns->[ -1 ]->get_objs_ref();
        my $last_lines   = $last_paras->[ -1 ]->get_objs_ref();

        %end_ref = ('L1' => $#{ $pages }, 'L2' => $#{ $last_columns }, 'L3' => $#{ $last_paras }, 'L4' => $#{ $last_lines });
    }

    # Odd case: when citation is longer than the content itself, what should we do?
    # (1.8 * cite >= 0.8 * total is the same test as cite >= 0.8 * body used by
    # FindCitationText, assuming get_content() covers body plus citations.)
    if (1.8 * $reference_length >= 0.8 * length($doc->get_content()))
    {
        print STDERR "Citation text longer than article body: ignoring\n";

        %start_ref      = ();
        %end_ref        = ();
        $reference_text = "";

        return (\%start_ref, \%end_ref, \$reference_text);
    }

    # Now we have the citation text
    return (\%start_ref, \%end_ref, \$reference_text, \@cit_addrs);
}

###
# Looks for reference section markers in the supplied text and
# separates the citation text from the body text based on these
# indicators.  If it looks like there is a reference section marker
# too early in the document, this procedure will try to find later
# ones.  If the final reference section is still too long, an empty
# citation text string will be returned.  Returns references to
# the citation text, normalized body text, and original body text.
#
# NOTE(review): in the "too long" case the code below returns the raw,
# un-normalized citation text rather than an empty string, which does not
# match the description above. Behavior kept as is; confirm which is intended.
###
sub FindCitationText
{
    my ($rtext, $pos_array) = @_;

    # Save the text
    my $text     = $$rtext;
    my $bodytext = "";
    my $citetext = "";

    ###
    # Corrected by Cheong Chi Hong <chcheong@cse.cuhk.edu.hk> 2 Feb 2010
    # while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCE?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*\n+/sg)
    # {
    ###
    ###
    # Corrected by Huy Do, 15 Jan 2011
    # while ($text =~ m/\b(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*\n+/sg)
    # {
    ###
    # Every heading found moves the split point forward, so the last one wins
    while ($text =~ m/${REFERENCE_HEADING}:?\s*\n+/sg)
    {
        $bodytext = substr $text, 0, pos $text;
        $citetext = substr $text, pos $text unless (pos $text < 1);
    }

    # No citation
    if ($citetext eq "")
    {
        print STDERR "Citation text cannot be found: ignoring", "\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Odd case: when citation is longer than the content itself, what should we do?
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Citation stops when another section starts: keep only what precedes the
    # first section heading
    my ($scitetext) = split(/^([\s\d\.]+)?(Acknowledge?ments?|Autobiographical|Tables?|Appendix|Exhibit|Annex|Fig|Notes?)(.*?)\n+/m, $citetext);

    if (defined $scitetext && length($scitetext) > 0) { $citetext = $scitetext; }

    # No citation exists
    if ($citetext eq '0' || ! defined $citetext) { print STDERR "warning: no citation text found\n"; }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}

###
# Huydhn: find citation section in raw text
# This function is used exclusively when the citation
# section is provided by sectlabel
#
# $rtext      reference to the whole text
# $rcit_lines reference to the list of (0-based) indices of the citation lines
# $pos_array  passed through to NormalizeBodyText
#
# Returns references to the citation text, normalized body text, and
# original body text. The body text is made of the lines that precede the
# first citation line only.
###
sub FindCitationText2
{
    my ($rtext, $rcit_lines, $pos_array) = @_;

    # Citation and body text
    my $citetext = "";
    my $bodytext = "";

    # All line in the document
    my @lines = split(/\n/, $$rtext);

    # Append all lines that belong to the citation
    foreach my $line_index (@{ $rcit_lines })
    {
        $citetext .= $lines[ $line_index ] . "\n";
    }

    # Every line before the first citation line belongs to the body text
    for (my $i = 0; $i < $rcit_lines->[ 0 ]; $i++)
    {
        $bodytext .= $lines[ $i ] . "\n";
    }

    # Odd case: when citation is longer than the content itself, what should we do?
    if (length($citetext) >= 0.8 * length($bodytext))
    {
        print STDERR "Citation text longer than article body: ignoring\n";
        return \$citetext, NormalizeBodyText(\$bodytext, $pos_array), \$bodytext;
    }

    # Now we have the citation text
    return (NormalizeCiteText(\$citetext), NormalizeBodyText(\$bodytext, $pos_array), \$bodytext);
}

##
# Removes lines that appear to be junk from the citation text.
##
# Takes a reference to the citation text and returns a reference to a new
# string in which every line is trimmed and junk lines are dropped.
sub NormalizeCiteText
{
    my ($rcitetext) = @_;

    my @newlines = ();
    my @lines    = split "\n", $$rcitetext;

    ###
    # Modified by Artemy Kolchinsky (v090625)
    # In some cases, I had situations like:
    # Smith B, "Blah Blah." Journal1, 2000, p. 23-
    # 85
    # Here, the line consisting of '85' is part of the citation and shouldn't be dropped,
    # even though it only consist of numeric characters. The way I went about this is
    # that I dropped those lines consisting of only spacing characters, *or* only numeric
    # characters *if the previous line did not end on a hyphen*.
    ###
    my $oldline = "";

    foreach my $line (@lines)
    {
        $line =~ s/^\s+//;    # Dropped leading spaces added by Thang (v090625)
        $line =~ s/\s+$//;    # Dropped trailing spaces added by Thang (v090625)

        my $is_blank        = ($line eq "");
        my $is_stray_number = ($line =~ m/^\d+$/) && ($oldline !~ m/\-$/);

        # Junk lines are dropped, but they still count as "the previous line"
        push @newlines, $line unless ($is_blank || $is_stray_number);

        $oldline = $line;
    }
    ###
    # End modified by Artemy Kolchinsky (v090625)
    ###

    my $newtext = join "\n", @newlines;
    return \$newtext;
}

###
# Thang May 2010
# Address the problem Nick mentioned in method normalizeBodyText()
# This method handle multiple bracket references in a line, e.g "abc [1, 2-5, 11] def [1-3, 5] ghi jkl"
# + this method maps the position of tokens in normalized body text --> positions of tokens in body text (for later retrieve context positions)
#
# $line        one line of body text
# $pos_array   array reference; one entry is appended for every whitespace
#              separated token of the returned line, holding the index of the
#              original token it comes from
# $token_count index of the first original token of this line
#
# Returns the expanded line and the token index at which the next line starts.
# A single space replaces the whitespace that follows a marker.
###
sub ExpandBracketMarker
{
    my ($line, $pos_array, $token_count) = @_;

    # $line = "abc [1, 2-5, 11] def [1-3, 5] ghi jkl";
    # $line = "abc[1, 2-5, 11]def[1-3, 5]ghi jkl";
    # $line = "abc def ghi jkl";

    # The lists before and after the range are captured as a whole: capturing
    # a repeated group would only keep its last repetition and silently drop
    # the other numbers (e.g. the "1, " of "[1, 2, 3-5]").
    my $marker_re = qr/\[((?:\d+[,;] *)*)(\d+)-(\d+)((?:[,;] *\d+)*)\]/;

    my $newline    = "";
    my $space_flag = 0;    # the text consumed so far was followed by whitespace
    my $matched    = 0;    # at least one marker was found

    # $line is cut down to the text after the marker on every iteration
    while ($line =~ m/$marker_re/)
    {
        my ($prefix, $first, $last, $suffix) = ($1, $2, $3, $4);

        my $front = substr($line, 0, $-[0]);
        my $match = substr($line, $-[0], $+[0] - $-[0]);
        $line     = substr($line, $+[0]);

        # Handle front part
        if ($space_flag == 1) { $newline .= " "; }
        $newline .= $front;

        my @tokens = split(/\s+/, $front);
        my $length = scalar(@tokens);
        for (my $i = 0; $i < $length; $i++)
        {
            # Without separating whitespace the last token is glued to the
            # marker and is counted together with the marker's first token
            if ($i < ($length - 1) || $front =~ m/\s$/)
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        # Handle match part
        my $num_new_tokens = $last - $first;
        if ($num_new_tokens > 0)
        {
            $match = "[" . $prefix . TransformMarker($first, $last) . $suffix . "]";
        }
        else
        {
            $num_new_tokens = 0;
        }
        $newline .= $match;

        my $followed_by_space = ($line =~ m/^\s/) ? 1 : 0;

        @tokens = split(/\s+/, $match);
        $length = scalar(@tokens);
        for (my $i = 0; $i < $length; $i++)
        {
            # The last token is counted later when it is glued to what follows
            next unless ($i < ($length - 1) || $followed_by_space);

            if ($i >= ($length - $num_new_tokens - 1) && $i < ($length - 1))
            {
                # Token added by the expansion: it does not consume an original token
                push(@{ $pos_array }, $token_count);
            }
            else
            {
                push(@{ $pos_array }, $token_count++);
            }
        }

        if ($followed_by_space)
        {
            $space_flag = 1;
            $line =~ s/^\s+//;
        }
        else
        {
            $space_flag = 0;
        }

        $matched = 1;
    }

    if ($space_flag == 1) { $newline .= " "; }
    $newline .= $line;

    my @tokens = split(/\s+/, $line);
    if (scalar(@tokens) > 0)
    {
        foreach my $token (@tokens)
        {
            push(@{ $pos_array }, $token_count++);
        }
    }
    elsif ($matched && $space_flag == 0)
    {
        # The line ends right after a marker: its last token was deferred above
        # and nothing follows to account for it, so count it here
        push(@{ $pos_array }, $token_count++);
    }

    return ($newline, $token_count);
}

###
# Removes lines that appear to be junk from the body text,
# de-hyphenates words where a hyphen occurs at the end of
# a line, and normalizes strings of blank spaces to only
# single blancks.
#
# HISTORY: Nick (v081201)
#
# In some publications markers with a range such as [1-5] or [1-12, 16]
# are used. ParsCit cannot find these markers. I added a simple
# workaround to PreProcess::normalizeBodyText. The markers with range
# are replaced by markers containing every number of the range
# (e.g. [1-5] replaced by [1, 2, 3, 4, 5]).
###
# $rtext     reference to the body text
# $pos_array array reference; receives, for every token of the normalized
#            text, the position reported by ExpandBracketMarker
#
# Returns a reference to the normalized text. Dies when the number of tokens
# of a line and the number of positions reported for it disagree.
sub NormalizeBodyText
{
    my ($rtext, $pos_array) = @_;

    my @lines       = split "\n", $$rtext;
    my $text        = "";
    my $token_count = 0;

    foreach my $line (@lines)
    {
        $line =~ s/^\s+//;    # Thang May 2010: trip leading spaces

        my @tmp_pos_array = ();
        ($line, $token_count) = ExpandBracketMarker($line, \@tmp_pos_array, $token_count);    # Thang May 2010

        my @tokens = split(/\s+/, $line);
        if (scalar(@tokens) != scalar(@tmp_pos_array))
        {
            # Report the two sizes: interpolating the arrays themselves only
            # dumped their content
            die "NormalizeBodyText: " . scalar(@tokens) . " tokens but " . scalar(@tmp_pos_array) . " positions\n$line\n";
        }

        #$line =~ s/\[(\d+[,;] *)*((\d+)-(\d+))([,;] *\d+)*\]/"[".$1.transformMarker($3,$4).$5."]"/e;

        if ($line =~ m/^\s*$/) { next; }

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # !!! merge without removing "-" if preceeded by numbers...
        ###
        if ($text =~ s/([A-Za-z])\-$/$1/)
        {
            # The first token of this line completes the hyphenated word that
            # ends the text so far, so it does not get a position of its own
            $text .= $line;
            shift(@tmp_pos_array);
        }
        else
        {
            # Thang May 2010: change m/\-\s*$/ -> m/\-\s+$/
            if ($text !~ m/\-\s+$/ && $text ne "") { $text .= " " }
            $text .= $line;
        }

        push(@{ $pos_array }, @tmp_pos_array);
        ###
        # End modified by Artemy Kolchinsky (v090625)
        ###
    }

    $text =~ s/\s{2,}/ /g;
    return \$text;
}

#
# Expands a numeric range into the list of its numbers separated by ", ",
# e.g. (1, 4) gives "1, 2, 3, 4". When the second number is not greater than
# the first one, only the first number is returned.
#
sub TransformMarker
{
    my ($first_number, $second_number) = @_;

    my $new_marker = $first_number;
    for (my $i = ($first_number + 1); $i <= $second_number; $i++)
    {
        $new_marker .= ", " . $i;
    }

    return $new_marker;
}

###
# Controls the process by which citations are segmented, based
# on the result of trying to guess the type of citation marker
# used in the reference section.  Returns a reference to a list
# of citation objects.
###
sub SegmentCitations
{
    my ($rcite_text) = @_;

    my $marker_type = GuessMarkerType($rcite_text);

    # Explicit markers are the most reliable clue; fall back to heuristics otherwise
    if ($marker_type ne 'UNKNOWN')
    {
        return SplitCitationsByMarker($rcite_text, $marker_type);
    }

    return SplitUnmarkedCitations($rcite_text);
}

###
# Segments citations that have explicit markers in the
# reference section.  Whenever a new line starts with an
# expression that matches what we'd expect of a marker,
# a new citation is started.
# Returns a reference to a
# list of citation objects.
#
# $rcite_text  reference to the citation text
# $marker_type key of %marker_types naming the marker style to look for
#
# Text found before the first marker becomes a citation without marker.
###
sub SplitCitationsByMarker
{
    my ($rcite_text, $marker_type) = @_;

    my @citations               = ();
    my $current_citation        = ParsCit::Citation->new();
    my $current_citation_string = undef;

    # A marker at the beginning of a line, then the rest of the line
    my $marker_line = qr/^\s*($marker_types{ $marker_type })\s*(.*)$/;

    # TODO: Might want to add a check that marker number is
    # increasing as we'd expect, if the marker is numeric.
    foreach my $line (split "\n", $$rcite_text)
    {
        if ($line =~ m/$marker_line/)
        {
            my ($marker, $cite_string) = ($1, $2);

            # Close the citation being built, if any
            if (defined $current_citation_string)
            {
                $current_citation->setString($current_citation_string);
                push @citations, $current_citation;
                $current_citation_string = undef;
            }

            $current_citation = ParsCit::Citation->new();
            $current_citation->setMarkerType($marker_type);
            $current_citation->setMarker($marker);

            $current_citation_string = $cite_string;
        }
        else
        {
            ###
            # Modified by Artemy Kolchinsky (v090625)
            # !!! merge without removing "-" if preceeded by numbers...
            ###
            if ((defined $current_citation_string) && ($current_citation_string =~ m/[A-Za-z]\-$/))
            {
                # Merge words when lines are hyphenated
                $current_citation_string =~ s/\-$//;
                $current_citation_string .= $line;
            }
            else
            {
                # No space after a hyphen that follows something else than a
                # letter (e.g. a page range broken across lines)
                if ((! defined $current_citation_string) || ($current_citation_string !~ m/\-\s*$/))
                {
                    $current_citation_string .= " ";
                } #!!!
                $current_citation_string .= $line;
            }
            ###
            # End modified by Artemy Kolchinsky (v090625)
            ###
        }
    }

    # Last citation
    if (defined $current_citation && defined $current_citation_string)
    {
        $current_citation->setString($current_citation_string);
        push @citations, $current_citation;
    }

    # Now, we have an array of separated citations
    return \@citations;
}

###
# Uses several heuristics to decide where individual citations
# begin and end based on the length of previous lines, strings
# that look like author lists, and punctuation.  Returns a
# reference to a list of citation objects.
#
# HISTORY: Modified in 081201 by Nick and J\"{o}ran.
#
# There was an error with unmarkedCitations.
# ParsCit ignored the last
# citation in the reference section due to a simple error in a for loop.
# In PreProcess::splitUnmarkedCitations (line 241; line 258 in my
# modified file) "$k<$#citeStarts" is used as exit condition. It should
# be "<=" and not "<" because $#citeStarts provides the last index and
# not the length of the array.
#
# HISTORY: Modified in 081201 by Min to remove superfluous print statements
#
# Takes a reference to the citation text. Every line holding something that
# looks like a year triggers a backward search for the first line of the
# citation it belongs to. Each citation start is recorded once.
###
sub SplitUnmarkedCitations
{
    my ($rcite_text) = @_;

    my @content     = split "\n", $$rcite_text;
    my $cite_start  = 0;
    my @cite_starts = ();
    my @citations   = ();

    ###
    # Huydhn: when a line is an author line (the line at the start of
    # a citation with a long list of author), the next line cannot be
    # the start of another (consequence) citation. This next line should
    # be the next part of the current citation after the author line.
    ###
    my $last_author_line = undef;

    for (my $i = 0; $i <= $#content; $i++)
    {
        # Only lines holding something like a year are of interest
        next unless ($content[ $i ] =~ m/\b\(?[1-2][0-9]{3}[\p{IsLower}]?[\)?\s,\.]*(\s|\b)/s);

        for (my $k = $i; $k > $cite_start; $k--)
        {
            # No /g here: in boolean context it would leave a match position on
            # the line and make repeated tests of the same line alternate
            next unless ($content[ $k ] =~ m/\s*[\p{IsUpper}]/);

            ###
            # Huydhn: The previous line is an author line, so this line
            # cannot be the start of another citation
            ###
            # (undef must not be compared as a number: it would equal line 0)
            if (defined $last_author_line && $last_author_line == $k - 1) { next; }

            # If length of previous line is extremely
            # small, then start a new citation here.
            if (length($content[ $k - 1 ]) < 2)
            {
                $cite_start = $k;
                last;
            }

            # Start looking backwards for lines that could
            # be author lists - these usually start the
            # citation, have several separation characters (,;),
            # and shouldn't contain any numbers.
            my $beginning_author_line = -1;

            for (my $j = $k - 1; $j > $cite_start; $j--)
            {
                if ($content[ $j ] =~ m/\d/) { last; }

                # Number of separators in the line
                my $n_sep = ($content[ $j ] =~ tr/,;//);
                if ($n_sep < 3) { last; }

                # $j is at least 1 here, so the previous line always exists
                if ($content[ $j - 1 ] =~ m/\.\s*$/)
                {
                    $beginning_author_line = $j;
                }
            }

            if ($beginning_author_line >= 0)
            {
                $cite_start = $beginning_author_line;

                ###
                # Huydhn: see $last_author_line
                ###
                $last_author_line = $beginning_author_line;
                last;
            }

            # Now that the backwards author search failed
            # to find any extra lines, start a new citation
            # here if the previous line ends with a ".".

            ###
            # Modified by Artemy Kolchinsky (v090625)
            # A new citation is started if the previous line ended with
            # a period, but not if it ended with a period, something else,
            # and then a period.  This is to avoid assuming that abbrevations,
            # like U.S.A. , indicate the end of a cite.  Also, a new cite is
            # started only if the current line does not begin with a series of
            # 4 digits.  This helped avoid some mis-parsed citations for me.
            # The new if-statement read like:
            ###
            if ($content[ $k - 1 ] =~ m/[^\.].\.\s*$/ && $content[ $k ] !~ m/^\d\d\d\d/)
            {
                $cite_start = $k;
                last;
            }
        } # End of for

        # Record each start once. Pushing line 0 again for every year found
        # before the second citation used to produce empty citations.
        if (scalar(@cite_starts) == 0 || $cite_start > $cite_starts[ -1 ])
        {
            push @cite_starts, $cite_start;
        }
    }

    for (my $k = 0; $k <= $#cite_starts; $k++)
    {
        # A citation runs up to the line before the next start, or to the end
        my $first_line = $cite_starts[ $k ];
        my $last_line  = ($k == $#cite_starts) ? $#content : ($cite_starts[ $k + 1 ] - 1);

        my $cite_string = MergeLines(join "\n", @content[ $first_line .. $last_line ]);

        my $citation = ParsCit::Citation->new();
        $citation->setString($cite_string);
        push @citations, $citation;
    }

    # And then from nothing came everything
    return \@citations;
}

###
# Controls the process by which citations are segmented.
# Input includes XML information.
# Returns a reference to a list of citation objects.
#
# Added by Huydhn, 13 Jan 2011
#
# $rcite_text_from_xml reference to the citation text
# $tmp_file            file handed to SplitUnmarkedCitations2 when no marker
#                      style can be guessed
###
sub SegmentCitationsXML
{
    my ($rcite_text_from_xml, $tmp_file) = @_;

    # TODO: Need to be removed
    my $marker_type = GuessMarkerType($rcite_text_from_xml);

    if ($marker_type ne 'UNKNOWN')
    {
        # TODO: Need to be removed
        return SplitCitationsByMarker($rcite_text_from_xml, $marker_type);
    }

    # Huydhn: split reference using crf++ model
    return SplitUnmarkedCitations2($tmp_file);
}

###
# Replace heuristics rules with crf++ model based on both textual
# and XML features from Omnipage.
#
# HISTORY: Added in 100111 by Huy Do
#
# $infile is handed to ParsCit::Tr2crfpp::SplitReference, which writes its
# result to "<infile>_split.dec". Both files are deleted before returning.
# Returns a reference to a list of citation objects; dies when the result
# file cannot be opened.
###
sub SplitUnmarkedCitations2
{
    my ($infile) = @_;

    # Citation list
    my @citations = ();

    # Run the crf++
    my $outfile = $infile . "_split.dec";

    if (ParsCit::Tr2crfpp::SplitReference($infile, $outfile))
    {
        # No fatal() exists in this package, so fail with a plain die
        open(my $file_handle, "<:utf8", $outfile) or die "Could not open file $outfile: $!\n";

        # Read all lines
        my @lines = ();
        while (my $row = <$file_handle>)
        {
            chomp($row);
            push @lines, $row;
        }
        close $file_handle;

        my $cit_str = "";

        # Turns the lines gathered so far into a citation object
        my $save_citation = sub
        {
            my $citation = ParsCit::Citation->new();
            # Clean up the citation text first, then save the citation
            $citation->setString(MergeLines($cit_str));
            push @citations, $citation;
        };

        foreach my $row (@lines)
        {
            my @tokens = split(/\s+/, $row);

            # Blank rows carry neither content nor class
            next unless (scalar(@tokens) > 0);

            # Get the class of the line: "parsCit_begin", "parsCit_continue", or "parsCit_end"
            my $class = $tokens[ -1 ];

            # Line content
            my $ln_con = $tokens[ 0 ];
            # Replace the ||| sequence with \s
            $ln_con =~ s/\|\|\|/ /g;

            # Beginning of a citation
            if ($class eq "parsCit_begin")
            {
                # Save the previous citation
                if ($cit_str ne "") { $save_citation->(); }

                # Create new citation
                $cit_str = $ln_con;
            }
            # Inside a citation
            elsif ($class ne "parsCit_unknown")
            {
                $cit_str = $cit_str . "\n" . $ln_con;
            }
        }

        # Last citation
        if ($cit_str ne "") { $save_citation->(); }
    }

    unlink($infile);
    unlink($outfile);

    # Our work here is done
    return \@citations;
}

###
# Merges lines of text by dehyphenating where appropriate,
# with normal spacing.
#
# Takes the text as a string and returns the merged, trimmed string.
###
sub MergeLines
{
    my ($text) = shift;

    my @lines       = split "\n", $text;
    my $merged_text = "";

    foreach my $line (@lines)
    {
        $line = Trim($line);

        ###
        # Modified by Artemy Kolchinsky (v090625)
        # # !!! merge without removing "-" if preceeded by numbers...
        ###
        if ($merged_text =~ m/[A-Za-z]\-$/)
        {
            # Merge words when lines are hyphenated
            $merged_text =~ s/\-$//;
            $merged_text .= $line;
        }
        else
        {
            # A hyphen after something else than a letter (e.g. a page range)
            # is kept and joined without space
            if ($merged_text !~ m/\-\s*$/) { $merged_text .= " " } #!!!
            $merged_text .= $line;
        }
        ###
        # End modified by Artemy Kolchinsky (v090625)
        ###
    }

    return Trim($merged_text);
}

###
# Uses a list of regular expressions that match common citation
# markers to count the number of matches for each type in the
# text.  If a sufficient number of matches to a particular type
# are found, we can be reasonably sure of the type.
#
# Takes a reference to the citation text and returns a key of %marker_types,
# or 'UNKNOWN'.
#
# NOTE(review): when the text has no line break the threshold is 0, so a
# marker type is returned even without any match. Kept as is; confirm.
###
sub GuessMarkerType
{
    my ($rcite_text) = @_;

    my $marker_type         = 'UNKNOWN';
    my %marker_observations = map { $_ => 0 } keys %marker_types;

    # The leading line break lets every pattern below anchor on "\n"
    my $cite_text = "\n" . $$rcite_text;

    # Number of line breaks in the original text
    my $n_lines = ($cite_text =~ tr/\n//) - 1;

    while ($cite_text =~ m/\n\s*($marker_types{'SQUARE'}([^\n]){10})/sg)
    {
        $marker_observations{'SQUARE'}++;
    }

    while ($cite_text =~ m/\n\s*($marker_types{'PAREN'}([^\n]){10})/sg)
    {
        $marker_observations{'PAREN'}++;
    }

    ###
    # Modified by Artemy Kolchinsky (v090625): remove space after {10})
    ###
    while ($cite_text =~ m/\n\s*($marker_types{'NAKEDNUM'} [^\n]{10})/sg)
    {
        $marker_observations{'NAKEDNUM'}++;
    }

    while ($cite_text =~ m/\n\s*$marker_types{'NAKEDNUMDOT'}([^\n]){10}/sg)
    {
        $marker_observations{'NAKEDNUMDOT'}++;
    }

    # Most frequent type first; equal counts are ordered by name so that the
    # answer does not depend on the hash order
    my @sorted_observations = sort { $marker_observations{ $b } <=> $marker_observations{ $a } || $a cmp $b } keys %marker_observations;

    my $min_markers = $n_lines / 6;
    if ($marker_observations{ $sorted_observations[0] } >= $min_markers)
    {
        $marker_type = $sorted_observations[0];
    }

    return $marker_type;
}

# Returns the given string without its leading and trailing whitespace.
sub Trim
{
    my $text = shift;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $text;
}

1;