# # Copyright 2007 Penn State University # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # package HeaderParse::API::AssembleXMLMetadata; # example input: # 5 #Third ILOG International Users Meeting , 9 10 July 1997 , Paris , France #Daily management of an earth observation satellite : #comparison of ILOG Solver with dedicated algorithms #for Valued Constraint Satisfaction Problems #Michel Lemaitre #G'erard Verfaillie #ONERA/CERT #2 , avenue #
Edouard Belin -- BP 4025 --
#
31055 Toulouse cedex 4 -- France
#fMichel.Lemaitre,Gerard.Verfaillieg@cert.fr ################Function Description######################## # Find the name and its affiliation/address mapping #Assumption: #(1) It is valid to split the header by authors into chunks; each resulting chunk has complete informaiton of the authors in that chunk #(2) Use edit-distance to find the mapping between authors and emails. #For each author chunk with N authors. #case 1: N = 1; # (a) if (exists following affi. and addr.) { # combine the following affi. and addrs. # } # (b) else # warning #case 2: N > 1; # (a) if the following affi. and addrs == 1 # these N people share the affi. and addr. # (b) if the following affi. and addrs == N # map 1-1 # (c) otherwise # warning #package finalize_metata_extraction_v4; use utf8; use HeaderParse::Config::API_Config; use Data::Dumper; use String::Approx 'adist'; use HeaderParse::API::Function qw(&weired_author); use CSXUtil::SafeText qw(&cleanXML &cleanAll); sub assemble(){ my $rstr = shift; $$rstr =~ s/^\s+//g; $$rstr =~ s/\s+$//g; my @xml_arr = split(/\s*(\d+)\s*<\/DID>/, $$rstr); $did=1; $xml_hash{$did} = $$rstr; #turn arr into hash; splice(@xml_arr,0,1); my %xml_hash_parsed; #start parsing authors and all their attributes ($xml_hash_parsed{$did}, $uncertain) = &parse_xml($did, $xml_hash{$did},$uncertain_addr); $xml_hash_parsed{$did}{raw} = $xml_hash{$did}; delete($xml_hash{$did}); if ($uncertain) { print STDERR "\n\n$did has mismatched address parsing \n"; $uncertain_addr++; } my $handle = Data::Dumper->new([\%xml_hash_parsed]); $$rstr = $handle->Dump; my $rFinalStr = &output_xml(\%xml_hash_parsed); return $rFinalStr; } #cluster needs initialization --the first cluster! sub parse_xml () { my $did = shift; my $str = shift; my $uncertain_addr = shift; my %xml_hash = (); my @lines = split(/\s*\n\s*/, $str); my $pre_stat = ""; my %pre_email = (); #pre address/affiliation info. my $pre_cluster_id = ""; my $pre_add_cluster_id = ""; my $cluster_affi_exist = 0; # may not be useful my $cluster_addr_exist = 0; #may not be useful my $abstractComplete = 0; my $line_count = 0; for my $i(0 .. $#lines) { my $line = $lines[$i]; $line = &string_clean($line); if ($line =~ /^\s*$/) {next;} if ($line =~ /\<(\w+)\>(.*)\<\/\w+\>/) { my $tag = $1; my $content = $2; #$content = &lclean($content); $line_count++; $content = &string_clean($content); if ($pre_stat ne $tag) { #different tags if ($tag =~ /abstract/) { if (!defined $xml_hash{$tag}) { $xml_hash{$tag} .= "$content"; } else { $abstractComplete = 1; } } elsif ($tag =~ /author/) { $content =~ s/\s*(\,|\;)\s*/ /g; $content = &string_clean($content); $xml_hash{cluster_num}++; my $cluster_id = $xml_hash{cluster_num}; $xml_hash{cluster}{$cluster_id}{start} = $line_count; $xml_hash{cluster}{$cluster_id}{end} = $line_count; #heuristically judge the correctness of name parsing, and clean names my @multi_names = split(/\s+and\s+/i, $content); for my $i(0 .. $#multi_names) { my $name = $multi_names[$i]; my ($weireness, $clean_name) = &weired_author($name); if ($weireness) {next;} $xml_hash{cluster}{$cluster_id}{author_num}++; my $author_id = $xml_hash{cluster}{$cluster_id}{author_num}; $xml_hash{cluster}{$cluster_id}{author}{$author_id}{name} = $clean_name; } #within cluster parameters update $cluster_affi_exist = 0; $cluster_addr_exist = 0; }elsif ($tag =~ /affiliation/) { my $cluster_id = $xml_hash{cluster_num}; if ($cluster_id <0) { print STDERR "warning $did has affiliations ahead of authors \n"; }else{ #start a new add_cluster regardless what different tags the previous line has $xml_hash{cluster}{$cluster_id}{add_cluster_num}++; my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}++; my $affi_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi}{$affi_id} = $content; #not good if ($pre_stat eq "email") { $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{pre_email} = %pre_email; } } $cluster_affi_exist =1; }elsif ($tag =~ /address/) { my $cluster_id = $xml_hash{cluster_num}; if ($cluster_id <0) { #print "warning $did has affiliations ahead of authors \n"; }else{ if ($pre_stat !~ /affiliation/) { $xml_hash{cluster}{$cluster_id}{add_cluster_num}++; } my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}++; my $addr_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr}{$addr_id} = $content; #not good if ($pre_stat eq "email") { $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{pre_email} = %pre_email; } } $cluster_addr_exist =1; }elsif ($tag =~ /email/) { # other tags if ($content =~ /\@/) { $parsed_emails = &parse_email($content); #concatenate these emails to the new one for my $i(0 .. $#$parsed_emails) { $$parsed_emails[$i] = &string_clean($$parsed_emails[$i]); $xml_hash{email_num}++; my $email_id = $xml_hash{email_num}; $xml_hash{email}{$email_id} = $$parsed_emails[$i]; $pre_email{$$parsed_emails[$i]}++; #if previous taf is affi/addr. point the pre-affi/addr down to email if (($pre_stat eq "affiliation") || ($pre_stat eq "address")) { $xml_hash{cluster}{$pre_cluster_id}{add_cluster}{$pre_add_cluster_id}{next_email}{$$parsed_emails[$i]} = 1; } } } }else { $xml_hash{$tag}= $content; } }else { #same tags with the previous line my $cluster_id = $xml_hash{cluster_num}; if ($tag =~ /author/) { $content =~ s/\s*(\,|\;)\s*/ /g; $content = &string_clean($content); if ($xml_hash{cluster}{$cluster_id}{end} != ($line_count-1)) { die "SVMHeaderParse: $did cluster assignment inappropriate"; }else { $xml_hash{cluster}{$cluster_id}{end} = $line_count; #heuristically judge the correctness of name parsing, and clean names my @multi_names = split(/\s+and\s+/i, $content); for my $i(0 .. $#multi_names) { my $name = $multi_names[$i]; my ($weireness, $clean_name) = &weired_author($name); if ($weireness) {next;} $xml_hash{cluster}{$cluster_id}{author_num}++; my $author_id = $xml_hash{cluster}{$cluster_id}{author_num}; $xml_hash{cluster}{$cluster_id}{author}{$author_id}{name} = $clean_name; } } }elsif ($tag =~ /affiliation/) { my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}++; my $affi_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{affi}{$affi_id} = $content; $cluster_affi_exist =1; }elsif ($tag =~ /address/) { my $add_cluster_id = $xml_hash{cluster}{$cluster_id}{add_cluster_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}++; my $addr_id = $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr_num}; $xml_hash{cluster}{$cluster_id}{add_cluster}{$add_cluster_id}{addr}{$addr_id} = $content; $cluster_addr_exist =1; }elsif ($tag =~ /email/) {# other tags if ($content =~ /\@/) { $parsed_emails = &parse_email($content); #concatenate these emails to the new one for my $i(0 .. $#$parsed_emails) { $$parsed_emails[$i] = &string_clean($$parsed_emails[$i]); $xml_hash{email_num}++; my $email_id = $xml_hash{email_num}; $xml_hash{email}{$email_id} = $$parsed_emails[$i]; $pre_email{$$parsed_emails[$i]}++; } } } elsif ($tag =~ /abstract/) { if ($abstractComplete <= 0) { $xml_hash{$tag} .= "\n$content"; } }else { $xml_hash{$tag} .= " $content"; } } #parameters update if (($tag eq "affiliation") || ($tag eq "address")) { $pre_cluster_id = $xml_hash{cluster_num}; $pre_add_cluster_id = $xml_hash{cluster}{$pre_cluster_id}{add_cluster_num}; %pre_email = (); }elsif ($tag ne "email") { %pre_email = (); $pre_cluster_id = ""; $pre_add_cluster_id = ""; } $pre_stat = $tag; } } $xml_hash{'abstractEnded'} = $abstractComplete; #the order of adjusting email and address is non changable $xml_hash = &adjust_email(\%xml_hash); ($xml_hash, $uncertain) = &adjust_addr($xml_hash, $uncertain_addr); return($xml_hash,$uncertain); } sub parse_email(){ my $content = shift; #parse email; email could be separated by only author names, or the whole email addresses my @all_emails = (); $content =~ s/(email|e-mail|e mail)(s)*(\s*\:\s*)*//gi; my @email_parts = split(/\@/, $content); if ($#email_parts < 2) { #only one @ if ($content =~ /\,|;|\{|\}|\[|\]/) { #multiple people with the same email root $content =~ s/\{|\}|\[|\]//g; my ($pre, $last) = split(/\@/, $content); my @authors = split(/\,/, $pre); for my $k(0 .. $#authors) { my $tmp_email = "$authors[$k]"."\@"."$last"; $tmp_email =~ s/^\s+//g; $tmp_email =~ s/\s+$//g; push @all_emails, $tmp_email; } }else { push @all_emails, $content; } }else { # do not consider the case that some @ string has 1+ people (if exists; extend the function) my @emails = (); if ($content =~ /\,|\;/) { @emails = split(/\,|\;/, $content); }else { @emails = split(/\s+/, $content); } for my $k(0 .. $#emails) { push @all_emails, $emails[$k]; } } for (my $i=0; $i<=$#all_emails; $i++) { my $email = $all_emails[$i]; $email =~ s/^[\(\[\s\<]+//; $email =~ s/[\]\)\s\>]+$//; $all_emails[$i] = $email; } return(\@all_emails); } sub adjust_email () { my $adjust_hash= shift; my %author_email = (); my %email_author = (); foreach my $cluster_id (sort {$a<=>$b} keys %{$$adjust_hash{cluster}}) { foreach my $author_id (sort {$a<=>$b} keys %{$$adjust_hash{cluster}{$cluster_id}{author}}) { my $name = $$adjust_hash{cluster}{$cluster_id}{author}{$author_id}{name}; my $name_id = "$cluster_id"."_"."$author_id"; if ($name !~ /\w/) {next;} foreach my $email_id(sort {$a<=>$b} keys %{$$adjust_hash{email}}) { my $email = $$adjust_hash{email}{$email_id}; my ($email_name, $email_root) = split(/\@/,$email); if ($email_name !~ /\w/) {next;} $dist = adist(lc($email_name), lc($name)); $author_email{$name_id}{$email} = abs($dist); $email_author{$email}{$name_id} = abs($dist); } } } foreach my $author_id (keys %author_email) { my $counter = keys %{$author_email{$author_id}}; foreach my $email_id (sort {$author_email{$author_id}{$b}<=>$author_email{$author_id}{$a}} keys %{$author_email{$author_id}}) { $author_email{$author_id}{$email_id}{rank} = $counter; $author_email{$author_id}{$email_id}{score} = $author_email{$author_id}{$email_id}; $counter--; } } foreach my $email_id (keys %email_author) { my $counter = keys %{$email_author{$email_id}}; foreach my $author_id (sort {$email_author{$email_id}{$b}<=>$email_author{$email_id}{$a}} keys %{$email_author{$email_id}}) { $email_author{$email_id}{$author_id}{rank} = $counter; $email_author{$email_id}{$author_id}{score} = $email_author{$email_id}{$author_id}; $counter--; } } my %picked_author; my %picked_email; my %final_map = (); my %sum_hash = (); #rank #add the ranks from two sides -> all combinations foreach my $author_id (sort {$a<=>$b} keys %author_email) { foreach my $email_id (keys %{$author_email{$author_id}}) { my $macro = "$author_id"."<>"."$email_id"; my ($cluster_id, $author_id2) = split(/\_/, $author_id); #my $macro = "$$adjust_hash{cluster}{$cluster_id}{author}{$author_id2}"."<>"."$email_id";; $sum_hash{$macro}{score} = $author_email{$author_id}{$email_id}{score} + $email_author{$email_id}{$author_id}{score}; } } foreach my $macro (sort {$sum_hash{$a}{score} <=> $sum_hash{$b}{score}} keys %sum_hash) { my ($author_id1, $email_id) = split(/<>/,$macro); if ($picked_author{$author_id1} || $picked_email{$email_id}) {next;} $picked_author{$author_id1} = 1; $picked_email{$email_id} = 1; $final_map{$author_id1} = $email_id; my ($cluster_id, $author_id2) = split(/\_/, $author_id1); $$adjust_hash{cluster}{$cluster_id}{author}{$author_id2}{email}=$email_id; } return($adjust_hash); } sub adjust_addr() { my $H= shift; #address/affiliation assignment is unreasonable my $uncertain = 0; foreach my $cluster_id (sort {$a<=>$b} keys %{$$H{cluster}}) { if ($$H{cluster}{$cluster_id}{author_num} eq 1) { if ($$H{cluster}{$cluster_id}{add_cluster_num} > 1) { #for one author - multiple address case; combine all my ($affi, $addr) = &combine_all_addr_set($$H{cluster}{$cluster_id}); if ($affi ne "") { $$H{cluster}{$cluster_id}{author}{1}{affi}=$affi; } if ($addr ne "") { $$H{cluster}{$cluster_id}{author}{1}{addr}=$addr; } }elsif ($$H{cluster}{$cluster_id}{add_cluster_num} eq 1) { my ($affi, $addr) = &combine_first_addr_set($$H{cluster}{$cluster_id}); #print "affi is $affi and addr is $addr \n"; if ($affi ne "") { $$H{cluster}{$cluster_id}{author}{1}{affi}=$affi; } if ($addr ne "") { $$H{cluster}{$cluster_id}{author}{1}{addr}=$addr; } }else { #print "warning: No address and affiliations\n"; } }elsif ($$H{cluster}{$cluster_id}{author_num} > 1) { if ($$H{cluster}{$cluster_id}{add_cluster_num} > 1) { if ($$H{cluster}{$cluster_id}{add_cluster_num} eq $$H{cluster}{$cluster_id}{author_num}) { my $addr_cluster = &combine_addr_set($$H{cluster}{$cluster_id}); #equally assign for my $author_id (1 .. $$H{cluster}{$cluster_id}{author_num}) { $$H{cluster}{$cluster_id}{author}{$author_id}{affi}=$$addr_cluster{$cluster_id}{affi}; $$H{cluster}{$cluster_id}{author}{$author_id}{addr}=$$addr_cluster{$cluster_id}{addr}; } }else { #check if existing emails as separators. my $consonence = &check_email_as_address_separator($$H{cluster}{$cluster_id}{add_cluster}); #emails ahead of each address cluster is the separator for author's address if (($consonence eq "pre") || ($consonence eq "next")) { $$H{cluster}{$cluster_id} = &adjust_cluster_by_email_separator($consonence, $$H{cluster}{$cluster_id}); }else { $uncertain =1; } #another strategies: #first name has first address; last name has last address $$H{cluster}{$cluster_id} = &assign_edge_address($$H{cluster}{$cluster_id}); #other stategies??? } }elsif ($$H{cluster}{$cluster_id}{add_cluster_num} eq 1) { my ($affi, $addr) = &combine_first_addr_set($$H{cluster}{$cluster_id}); #print "affi is $affi and addr is $addr \n"; for my $author_id (1 .. $$H{cluster}{$cluster_id}{author_num}) { $$H{cluster}{$cluster_id}{author}{$author_id}{affi}=$affi; $$H{cluster}{$cluster_id}{author}{$author_id}{addr}=$addr; } }else { #print "warning: No address and affiliations\n"; } } } return($H, $uncertain); } sub combine_first_addr_set() { my $H = shift; my $affi = ""; my $addr = ""; foreach my $affi_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{1}{affi}}) { $affi .= "\; $$H{'add_cluster'}{1}{affi}{$affi_id}"; } $affi =~ s/^\s*\;\s*//g; foreach my $addr_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{1}{addr}}) { $addr .= "\; $$H{'add_cluster'}{1}{addr}{$addr_id}"; } $addr =~ s/^\s*\;\s*//g; delete($$H{'add_cluster'}{1}{affi}); delete($$H{'add_cluster'}{1}{addr}); return($affi,$addr); } sub combine_addr_set() { my $H = shift; my %add_cluster = (); foreach my $cluster_id (sort {$a <=> $b} keys %{$$H{'add_cluster'}}) { my $affi = ""; my $addr = ""; foreach my $affi_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{affi}}) { $affi .= "\; $$H{'add_cluster'}{$cluster_id}{affi}{$affi_id}"; } $affi =~ s/^\s*\;\s*//g; foreach my $addr_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{addr}}) { $addr .= "\; $$H{'add_cluster'}{$cluster_id}{addr}{$addr_id}"; } $addr =~ s/^\s*\;\s*//g; if ($affi ne "") { $add_cluster{$cluster_id}{affi} = $affi; } if ($addr ne "") { $add_cluster{$cluster_id}{addr} = $addr; } delete($$H{'add_cluster'}{$cluster_id}{affi}); delete($$H{'add_cluster'}{$cluster_id}{addr}); } return(\%add_cluster); } sub combine_all_addr_set() { my $H = shift; my $affi = ""; my $addr = ""; my %add_cluster = (); foreach my $cluster_id (sort {$a <=> $b} keys %{$$H{'add_cluster'}}) { foreach my $affi_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{affi}}) { $affi .= "\; $$H{'add_cluster'}{$cluster_id}{affi}{$affi_id}"; } foreach my $addr_id(sort {$a <=> $b} keys %{$$H{'add_cluster'}{$cluster_id}{addr}}) { $addr .= "\; $$H{'add_cluster'}{$cluster_id}{addr}{$addr_id}"; } delete($$H{'add_cluster'}{$cluster_id}{affi}); delete($$H{'add_cluster'}{$cluster_id}{addr}); } return($affi,$addr); } sub combine_affi() { my $H = shift; my $affi = ""; foreach my $affi_id (sort {$a <=> $b} keys %{$$H{affi}}) { $affi .= "\; $$H{affi}{$affi_id}"; } $affi =~ s/^\s*\;\s*//g; return($affi); } sub combine_addr() { my $H = shift; my $addr = ""; foreach my $addr_id (sort {$a <=> $b} keys %{$$H{addr}}) { $addr .= "\; $$H{addr}{$addr_id}"; } $addr =~ s/^\s*\;\s*//g; return($addr); } sub check_email_as_address_separator(){ my $H = shift; my $consonence = 0; #0 is null state; -1 is conflict foreach my $add_cluster_id (sort {$a<=>$b} keys %{$H}) { if ($consonence eq "-1") {last;} my $pre_email = $$H{$add_cluster_id}{'pre_email'}; my $next_email = $$H{$add_cluster_id}{'next_email'}; ###################here foreach my $email_key (sort %{$pre_email}) { if ($email_key =~ /\w/) { if (($consonence eq "0") || ($consonence eq "pre")) { $consonence = "pre"; }elsif ($consonence ne "next") { $consonence = -1; } last; } } foreach my $email_key (sort %{$next_email}) { if ($email_key =~ /\w/) { if (($consonence eq "0") || ($consonence eq "next")) { $consonence = "next"; }elsif ($consonence ne "pre") { $consonence = -1; } last; } } } #end of checking emails as the address separators return($consonence); } sub adjust_cluster_by_email_separator() { my $consonence =shift; #$$H{cluster}{$cluster_id}) my $H = shift; if ($consonence eq "pre") { foreach my $add_cluster_id (sort {$a<=>$b} keys %{$$H{add_cluster}}) { my $pre_email_hash = $$H{add_cluster}{$add_cluster_id}{'pre_email'}; #check each email and assign to respective author foreach my $pre_email (keys %{$pre_email_hash}) { foreach my $author_id (keys %{$$H{author}}) { if ($pre_email eq $$H{author}{$author_id}{email}) { #need to combine them first. my $combine_affi = &combine_affi($$H{add_cluster}{$add_cluster_id}); my $combine_addr = &combine_addr($$H{add_cluster}{$add_cluster_id}); $$H{author}{$author_id}{affi} = $combine_affi; $$H{author}{$author_id}{addr} = $combine_addr; } } } } }elsif ($consonence eq "next") { foreach my $add_cluster_id (sort {$a<=>$b} keys %{$$H{add_cluster}}) { my $next_email_hash = $$H{add_cluster}{$add_cluster_id}{'next_email'}; #check each email and assign to respective author foreach my $next_email (keys %{$next_email_hash}) { foreach my $author_id (keys %{$$H{author}}) { if ($next_email eq $$H{author}{$author_id}{email}) { #need to combine them first. my $combine_affi = &combine_affi($$H{add_cluster}{$add_cluster_id}); my $combine_addr = &combine_addr($$H{add_cluster}{$add_cluster_id}); $$H{author}{$author_id}{affi} = $combine_affi; $$H{author}{$author_id}{addr} = $combine_addr; } } } } } return($H); } #assign first name first address and last name last address sub assign_edge_address () { my $H = shift; my $first_name_addr = $$H{author}{1}{addr}; my $first_name_affi = $$H{author}{1}{affi}; my $last_name_addr = $$H{author}{$$H{author_num}}{addr}; my $last_name_affi = $$H{author}{$$H{author_num}}{affi}; #needs to combine add and affi if (($first_name_addr eq "" ) && ($first_name_affi eq "")) { my $combine_affi = &combine_affi($$H{add_cluster}{1}); my $combine_addr = &combine_addr($$H{add_cluster}{1}); $$H{author}{1}{affi} = $combine_affi; $$H{author}{1}{addr} = $combine_addr; } if (($last_name_addr eq "" ) && ($last_name_affi eq "")) { my $combine_affi = &combine_affi($$H{add_cluster}{$$H{add_cluster_num}}); my $combine_addr = &combine_addr($$H{add_cluster}{$$H{add_cluster_num}}); $$H{author}{$$H{author_num}}{affi} = $combine_affi; $$H{author}{$$H{author_num}}{addr} = $combine_addr; } return($H); } sub output_xml(){ my $parsed_hash = shift; my $author_found = 0; my $l_algName = $algName; my $l_algVersion = $algVersion; cleanXML(\$l_algName); cleanXML(\$l_algVersion); my $str = "\n"; foreach my $did (sort {$a <=> $b} keys %{$parsed_hash}) { my $title = $$parsed_hash{$did}{title}; $title = repairPunctuation($title); cleanAll(\$title); $str.="$title\n"; $str.="\n"; foreach my $cluster_id (sort {$a <=> $b} keys %{$$parsed_hash{$did}{cluster}}) { if ($cluster_id =~ /\d+/) { foreach my $author_id ( sort {$a <=> $b} keys %{$$parsed_hash{$did}{cluster}{$cluster_id}{author}}) { $author_found = 1; my $name = $$parsed_hash{$did}{cluster}{$cluster_id}{author}{$author_id}{name}; cleanAll(\$name); $name = normalizeName($name); my $affi = $$parsed_hash{$did}{cluster}{$cluster_id}{author}{$author_id}{affi}; $affi = repairPunctuation($affi); cleanAll(\$affi); my $addr = $$parsed_hash{$did}{cluster}{$cluster_id}{author}{$author_id}{addr}; $addr = repairPunctuation($addr); cleanAll(\$addr); my $email = $$parsed_hash{$did}{cluster}{$cluster_id}{author}{$author_id}{email}; cleanAll(\$email); if ($name =~ /\w/) { $str.="\n"; $str.="$name\n"; if ($affi =~ /\w/) { $str.="$affi\n"; } if ($addr =~ /\w/) { $str.="
$addr
\n"; } if ($email =~ /\w/) { $str.="$email\n"; } $str.="
\n"; } } } } $str.="
\n"; my $keywords = $$parsed_hash{$did}{keyword}; if ($keywords =~ /\w/) { $keywords = repairPunctuation($keywords); my @keywords = normalizeKeywords($keywords); $str .= "\n"; foreach my $keyword (@keywords) { cleanAll(\$keyword); $str .= "$keyword\n"; } $str .= "\n"; } my $abstract = $$parsed_hash{$did}{abstract}; if ($abstract =~ /\w/) { $abstract = repairPunctuation($abstract); $abstract = normalizeAbstract($abstract, $$parsed_hash{$did}{abstractEnded}); cleanAll(\$abstract); $str .= "$abstract\n"; } my $date = $$parsed_hash{$did}{date}; if ($date =~ /\d/) { $date = repairPunctuation($date); cleanAll(\$date); $date = normalizeDate($date); if (defined $date) { $str .= "$date\n"; } } } my $titlelength = length($$parsed_hash{$did}{title}); my $authorcount = scalar keys %{$$parsed_hash{$did}{cluster}{$cluster_id}{author}}; my $validHeader; if(length($$parsed_hash{$did}{title}) > 0 && $author_found) { $validHeader = "1"; } else { $validHeader = "0"; } $str.="$validHeader\n"; $str.="
\n"; return \$str; } sub normalizeName { my $name = shift; my @tokens = split " ", $name; my @newTokens = (); foreach my $token (@tokens) { if ($token =~ m/^and$/i) { next; } push @newTokens, $token; } return join " ", @newTokens; } sub normalizeKeywords { my $text = shift; my @tokens = split '\s*[\:\;\,]\s*', $text; for (my $i=0; $i<=$#tokens; $i++) { $tokens[$i] = trimPunctuation($tokens[$i]); } if ($tokens[0] =~ m/keyword|keyphrase/i) { return @tokens[1..$#tokens]; } return @tokens; } sub normalizeAbstract { my ($text, $abstractEnded) = @_; my @lines = split '\n', $text; if ($#lines < 0) { return ""; } if ($abstractEnded<=0) { my $minLines = 5; my $maxLines = 15; my $lineCount = 0; for (my $i=0; $i<$#lines; $i++) { $lineCount++; if (($lineCount >= $minLines) && $line =~ m/\.\s*$/) { last; } if ($lineCount >= $maxLines) { last; } } @lines = @lines[0..($lineCount-1)]; } my $abstract = ""; foreach my $line (@lines) { if ($line =~ m/\b(?:Abstract|ABSTRACT|abstract|Introduction|INTRODUCTION)\:?\s*$/ || $line =~ m/^\s*$/) { next; } if ($abstract =~ m/\-$/ || $abstract =~ m/^\s*$/s) { $abstract .= $line; } else { $abstract .= " $line"; } } return $abstract; } sub normalizeDate { my $date = shift; if ($date =~ m/(\b\d{4}\b)/) { my $year = $1; my @timeData = localtime(time); my $currentYear = $timeData[5]+1900; if ($year <= $currentYear+3) { return $year; } } return undef; } sub trimPunctuation { my $text = shift; $text =~ s/[\.\,\<\>\?\/\:\;\"\'\{\[\}\]\+\=\_\-\(\)\*\&\^\%\$\#\@\!\~\`\\\|]+\s*$//; $text =~ s/^\s*[\.\,\<\>\?\/\:\;\"\'\{\[\}\]\+\=\_\-\(\)\*\&\^\%\$\#\@\!\~\`\\\|]+//; return $text; } sub string_clean() { my $str = shift; $str =~ s/^\s+//g; $str =~ s/\s+$//g; return($str); } sub new { my $classname = shift; my $self = { XMLindent => ' ' }; my @upperentities = qw (nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not 173 reg macr deg plusmn sup2 sup3 acute micro para middot cedil supl ordm raquo frac14 half frac34 iquest Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN szlig agrave aacute acirc atilde auml aring aelig ccedil egrave eacute ecirc euml igrave iacute icirc iuml eth ntilde ograve oacute ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml); $upperentities[12] = '#173'; $self->{'hashentity'} = {}; for ( my $i=0; $i<=$#upperentities; $i++ ) { my $key = '&'.$upperentities[$i].';'; $self->{'hashentity'}->{$key}=$i+160; } $self->{'hashstr'} = (join (';|', @upperentities)).';'; bless $self, $classname; return $self; } sub char_converter() { my $H = { XMLindent => ' ' }; my @upperentities = qw (nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not 173 reg macr deg plusmn sup2 sup3 acute micro para middot cedil supl ordm raquo frac14 half frac34 iquest Agrave Aacute Acirc Atilde Auml Aring AElig Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml ETH Ntilde Ograve Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN szlig agrave aacute acirc atilde auml aring aelig ccedil egrave eacute ecirc euml igrave iacute icirc iuml eth ntilde ograve oacute ocirc otilde ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml); $upperentities[12] = '#173'; $H->{'hashentity'} = {}; for ( my $i=0; $i<=$#upperentities; $i++ ) { my $key = '&'.$upperentities[$i].';'; $H->{'hashentity'}->{$key}=$i+160; } $H->{'hashstr'} = (join (';|', @upperentities)).';'; return $H; } sub repairPunctuation { my $text = shift; $text =~ s/ / /gs; $text =~ s/\s([\.\,\;\]\)\:\}\!\?\>\-])/$1/gs; $text =~ s/^\s+//; $text =~ s/\s+$//; return $text; } # clean XML version two - for single-line streams sub lclean { my $t = shift; return undef if (! defined $t ); $H = &char_converter; # make ISOlat1 entities into Unicode character entities $t =~ s/&($H->{'hashstr'})/sprintf ("&#x%04X;", $H->{'hashentity'}->{$&})/geo; # escape non-XML-encoded ampersands (including from other characters sets) $t =~ s/&(?!((\#[0-9]*)|(\#x[0-9]*)|(amp)|(lt)|(gt)|(apos)|(quot));)/&/go; # convert extended ascii into Unicode character entities $t =~ s/[\xa0-\xff]/'&#'.ord ($&).';'/geo; # remove extended ascii that doesnt translate into ISO8859/1 $t =~ s/[\x00-\x08\x0B\x0C\x0E-\x1f\x80-\x9f]//go; # make tags delimiters into entities $t =~ s//>/go; # flatten whitespace $t =~ s/[\s\t\r\n]+/ /go; # kill leading and terminating spaces $t =~ s/^[ ]+(.+)[ ]+$/$1/; return $t; } 1;