package SectLabel::PreProcess;

###
# Utilities for finding header, body, and reference.
# Avoid normalization to maintain consistent number of lines in a document
# Simplified from ParsCit::PreProcess
#
# Minh-Thang Luong, v100401
###

use utf8;
use strict;
use warnings;

###
# Looks for header section markers in the supplied text and
# separates the header text from the body text based on these
# indicators.
#
# A line is a marker when it contains the word Abstract / ABSTRACT /
# Introduction(s) / INTRODUCTION(S) and either nothing follows the word
# (apart from an optional colon and whitespace), or the trailing text
# mentions "background" (e.g. "INTRODUCTION AND BACKGROUND").  The first
# such line at or after start_id is taken as the start of the body.
#
# If the header found this way is at least 80% as long as the body (this
# includes the case where no marker is found at all), the marker is
# ignored: the header length is reported as 0 and the body starts at
# start_id.  Diagnostics are printed to STDERR in that case.
#
# Input: reference to an array of lines, line id to start process,
#        number of lines (start_id < num_lines)
# Output: (header length, body length, body start id)
# Dies if start_id >= num_lines.
###
sub FindHeaderText {
    my ($lines, $start_id, $num_lines) = @_;

    if ($start_id >= $num_lines) {
        die "Die in SectLabel::PreProcess::findHeaderText: start id $start_id >= num lines $num_lines\n";
    }

    my $body_start_id = $start_id;
    for (; $body_start_id < $num_lines; $body_start_id++) {
        my $line = $lines->[$body_start_id];

        # An undefined slot can never hold a marker; skip it explicitly so
        # it does not trigger an "uninitialized" warning.  (In a C-style
        # loop, next still runs the increment expression.)
        next unless defined $line;

        if ($line =~ /^(.*?)\b(Abstract|ABSTRACT|Introductions?|INTRODUCTIONS?)\b(.*?):?\s*$/) {
            # Copy the capture before running another regex.
            my $trailing = $3;

            # Nothing but whitespace after the keyword: plain section marker.
            # (Same condition as "zero whitespace-separated tokens".)
            last if $trailing !~ /\S/;

            # Trailing text is only accepted for headings such as
            # "INTRODUCTION AND BACKGROUND".
            last if $trailing =~ /background/i;
        }
    }

    my $header_length = $body_start_id - $start_id;
    my $body_length   = $num_lines - $body_start_id;

    # A header that is too long relative to the body is not trusted.
    if ($header_length >= 0.8*$body_length) {
        print STDERR "Header text $header_length longer than 80% article body length $body_length: ignoring\n";

        $body_start_id = $start_id;
        $header_length = 0;
        $body_length   = $num_lines - $body_start_id;
    }

    if ($header_length == 0) {
        print STDERR "warning: no header text found\n";
    }

    return ($header_length, $body_length, $body_start_id);
}

###
# Looks for reference section markers in the supplied text and
# separates the citation text from the body text based on these
# indicators.  If it looks like there is a reference section marker
# too early in the document, this procedure will try to find later
# ones.
# If the final reference section is still too long (at least 80% of the
# body length, which includes the case where no marker is found), it is
# ignored and a citation length of 0 is returned.
#
# The text is scanned backwards from the last line, so the marker closest
# to the end of the document is the one that is used.  The returned body
# end id is the index of the marker line itself: that line is counted in
# the body length, and the citation length counts the lines after it.
#
# Input: reference to an array of lines, line id to start process,
#        number of lines (start_id < num_lines)
# Output: (body length, citation length, body end id)
# Dies if start_id >= num_lines.
###
sub FindCitationText {
    my ($lines, $start_id, $num_lines) = @_;

    if ($start_id >= $num_lines) {
        die "Die in SectLabel::PreProcess::findCitationText: start id $start_id >= num lines $num_lines\n";
    }

    my $body_end_id = $num_lines - 1;
    for (; $body_end_id >= $start_id; $body_end_id--) {
        my $line = $lines->[$body_end_id];

        # An undefined slot can never hold a marker.  (In a C-style loop,
        # next still runs the decrement expression.)
        next unless defined $line;

        last if $line =~ /(References?|REFERENCES?|Bibliography|BIBLIOGRAPHY|References?\s+and\s+Notes?|References?\s+Cited|REFERENCES?\s+CITED|REFERENCES?\s+AND\s+NOTES?):?\s*$/;
    }

    # When no marker matched, $body_end_id is $start_id - 1 here, giving a
    # body length of 0, so the 80% test below always resets the result.
    my $body_length     = $body_end_id - $start_id + 1;
    my $citation_length = $num_lines - 1 - $body_end_id;

    if ($citation_length >= 0.8*$body_length) {
        print STDERR "Citation text $citation_length longer than 80% article body length $body_length: ignoring\n";

        $body_end_id     = $num_lines - 1;
        $citation_length = 0;
        $body_length     = $body_end_id - $start_id + 1;
    }

    if ($citation_length == 0) {
        print STDERR "warning: no citation text found\n";
    }

    return ($body_length, $citation_length, $body_end_id);
}

###
# Counts the whitespace-separated tokens in a string.
# Input: text (undef is treated as empty)
# Output: number of tokens (0 for an empty or all-whitespace string)
###
sub CountTokens {
    my ($text) = @_;

    return 0 unless defined $text;

    $text =~ s/^\s+//;    # Strip leading spaces
    $text =~ s/\s+$//;    # Strip trailing spaces

    my @tokens = split(/\s+/, $text);
    return scalar(@tokens);
}

1;