package SectLabel::AAMatching;

###
# This package provides methods to solve the matching problem
# between author and affiliation in a pdf
#
# Do Hoang Nhat Huy 21 Apr, 11
###

use strict;

# Dependencies
use POSIX;
use IO::File;
use XML::Writer;
use XML::Writer::String;

use	Class::Struct;

# Local libraries
use SectLabel::Config;
use ParsCit::PostProcess;

use File::Basename;
# Directory that contains this module file
my $dir = dirname(__FILE__);
# ParsCit root directory, taken to be two levels above this module;
# it is used below and in AAMatching to build the CRF++ and dictionary paths
my $parscitHome = "$dir/../../";

# Dictionary
# NOTE(review): presumably filled by ReadDict(), which is not defined in this
# part of the file -- confirm
my %dict = ();
# CRF++
# Path of the crf_test binary: $CRFPP_HOME/bin/crf_test when the CRFPP_HOME
# environment variable holds a true value, otherwise the path configured in
# SectLabel::Config, resolved against the ParsCit root
my $crft = $ENV{'CRFPP_HOME'} ? "$ENV{'CRFPP_HOME'}/bin/crf_test" : "$parscitHome/$SectLabel::Config::crf_test";

# Record holding the matching (relational classifier) features of one author
#	signals		- array of the signal strings attached to the author
#	coordinates	- top, bottom, left, right
#	position	- page, section, para, line
struct(
	'aut_rcfeatures',
	{
		# The only array element
		signals	=> '@',
		# Every other element is a plain scalar
		map { ( $_ => '$' ) } qw(top bottom left right page section para line),
	}
);

# Record holding the matching (relational classifier) features of one affiliation
#	signals		- array of the signal strings attached to the affiliation
#	coordinates	- top, bottom, left, right
#	position	- page, section, para, line
struct(
	'aff_rcfeatures',
	{
		# The only array element
		signals	=> '@',
		# Every other element is a plain scalar
		map { ( $_ => '$' ) } qw(top bottom left right page section para line),
	}
);

# Match the authors of a document with their affiliations and report the
# result as an XML string.
#
# Parameters
#	$doc		- document handed to Omni::Traversal::OmniCollector
#	$aut_addrs	- addresses of the author lines, also given to the author feature extractor
#	$aff_addrs	- addresses of the affiliation lines, also given to the affiliation feature extractor
#
# Returns
#	the XML document (<algorithm><results>...</results></algorithm>) as a string:
#	every author with the institutions matched to it, followed by the list of
#	all institutions found
sub AAMatching
{
	my ($doc, $aut_addrs, $aff_addrs) = @_;

	my $need_object	= 1;
	# Get the author objects
	my $aut_lines	= Omni::Traversal::OmniCollector($doc, $aut_addrs, $need_object);
	# Get the affiliation objects
	my $aff_lines	= Omni::Traversal::OmniCollector($doc, $aff_addrs, $need_object);
	
	# Dictionary
	ReadDict($parscitHome . $SectLabel::Config::dictFile);

	# Authors
	my ($aut_features, $aut_rc_features) = AuthorFeatureExtraction($aut_lines, $aut_addrs);
	# Call CRF
	my ($aut_signal, $aut_rc) = AuthorExtraction($aut_features, $aut_rc_features);

	# Affiliations
	my ($aff_features, $aff_rc_features) = AffiliationFeatureExtraction($aff_lines, $aff_addrs);
	# Call CRF
	my ($aff_signal, $aff_rc, $affs) = AffiliationExtraction($aff_features, $aff_rc_features);

	# Matching features
	my $aa_features = AAFeatureExtraction($aut_rc, $aff_rc);
	# Matching
	my $aa			= AAMatchingImp($aa_features);

=pod
	# DEBUG
	my $aut_handle	= undef;
	my $aff_handle	= undef;
	my $aau_handle	= undef;
	my $aaf_handle	= undef;
	my $aut_debug	= undef;
	my $aff_debug	= undef;
	my $aa_handle	= undef;

	open $aut_handle, ">:utf8", "aut.features"; 
	open $aff_handle, ">:utf8", "aff.features"; 
	open $aau_handle, ">:utf8", "aau.features"; 
	open $aaf_handle, ">:utf8", "aaf.features"; 
	open $aut_debug, ">:utf8", "aut.debug.features"; 
	open $aff_debug, ">:utf8", "aff.debug.features"; 
	open $aa_handle, ">:utf8", "aa.features"; 

	print $aut_handle $aut_features;
	print $aff_handle $aff_features;
	print $aau_handle $aut_rc_features;
	print $aaf_handle $aff_rc_features;
	print $aa_handle $aa_features, "\n";

	foreach my $author (keys %{ $aut_rc } )
	{
		print $aut_debug $author, ": ", "\n";

		foreach my $signal (@{ $aut_rc->{ $author }->signals })
		{
			print $aut_debug "\t", $signal, "\n";
		}

		print $aut_debug "\t", $aut_rc->{ $author }->top, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->bottom, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->left, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->right, "\n";

		print $aut_debug "\t", $aut_rc->{ $author }->page, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->section, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->para, "\n";
		print $aut_debug "\t", $aut_rc->{ $author }->line, "\n";
	}

	foreach my $affiliation (keys %{ $aff_rc } )
	{
		print $aff_debug $affiliation, ": ", "\n";
		
		foreach my $signal (@{ $aff_rc->{ $affiliation }->signals })
		{
			print $aff_debug "\t", $signal, "\n";
		}

		print $aff_debug "\t", $aff_rc->{ $affiliation }->top, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->bottom, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->left, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->right, "\n";

		print $aff_debug "\t", $aff_rc->{ $affiliation }->page, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->section, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->para, "\n";
		print $aff_debug "\t", $aff_rc->{ $affiliation }->line, "\n";
	}

	close $aut_handle;
	close $aff_handle;
	close $aau_handle;
	close $aaf_handle;
	close $aut_debug;
	close $aff_debug;
	close $aa_handle;
	# END
=cut

	# Do the matching
	# XML string
	my $sxml 	= "";
	# and XML writer (direct method call instead of the indirect object syntax)
	my $writer	= XML::Writer->new(OUTPUT => \$sxml, ENCODING => 'utf-8', DATA_MODE => 'true', DATA_INDENT => 2);

	# Algorithm
	$writer->startTag("algorithm", "name" => "AAMatching", "version" => $SectLabel::Config::algorithmVersion);

	# XML header
	# One timestamp feeds both attributes, so they always describe the same
	# instant and no external `date` process is needed
	my $time = time;
	my $date = strftime("%a %b %e %H:%M:%S %Z %Y", localtime($time));
	# Write XML header
	$writer->startTag("results", "time" => $time, "date" => $date);

	# Write authors
	$writer->startTag("authors");

	# Write the author name and his corresponding institution
	# (sorted, so that the same input always yields the same XML)
	foreach my $author (sort keys %{ $aut_signal })
	{
		$writer->startTag("author");

		$writer->startTag("fullname", "source" => "parscit");
		$writer->characters($author);
		$writer->endTag("fullname");

		$writer->startTag("institutions");
=pod
		foreach my $signal (@{ $aut_signal->{ $author } })
		{
			$signal =~ s/^\s+|\s+$//g;
			# Skip blank
			if ($signal eq "") { next; }

			$writer->startTag("institution", "symbol" => $signal);
			$writer->characters($aff_signal->{ $signal });
			$writer->endTag("institution");
		}
=cut

		# An author without any match has no entry in $aa
		foreach my $affiliation (@{ $aa->{ $author } || [] })
		{
			$writer->startTag("institution");
			$writer->characters($affiliation);
			$writer->endTag("institution");
		}

		$writer->endTag("institutions");

		$writer->endTag("author");
	}

	# Finish authors
	$writer->endTag("authors");

	# Write institutions
	$writer->startTag("institutions");
	
	# Write the institution name
	foreach my $institute (@{ $affs })
	{
		$writer->startTag("institution");
		$writer->characters($institute);
		$writer->endTag("institution");
	}

	$writer->endTag("institutions");

	# Done
	$writer->endTag("results");
	# Done
	$writer->endTag("algorithm");
	# Done
	$writer->end();

	# Return the xml content back to the caller
	return $sxml;
}

# Build the features of the relational classifier for every
# (author, affiliation) pair.
#
# Parameters
#	$aut_rc	- hash ref: author name => aut_rcfeatures record
#	$aff_rc	- hash ref: affiliation name => aff_rcfeatures record
#
# Returns
#	a string with one line per pair, its columns separated by tabs:
#	content (author#affiliation, both split on whitespace and re-joined with '|||'),
#	signal (same|diff), same page, same section, same paragraph, same line,
#	nearest affiliation on the x axis, nearest affiliation on the y axis
#	(the last six are yes|no)
sub AAFeatureExtraction
{
	my ($aut_rc, $aff_rc) = @_;

	# The affiliations are visited in this order for every author
	my @aff_names = keys %{ $aff_rc };
	# No affiliation means no pair to describe
	return "" if (! @aff_names);

	# Content form of each affiliation name
	my %aff_label = map { ( $_ => join('|||', split(/\s/, $_)) ) } @aff_names;

	my $features = "";
	foreach my $author (keys %{ $aut_rc })
	{
		my $aut			= $aut_rc->{ $author };
		my $aut_label	= join('|||', split(/\s/, $author));

		# Center of the author, it is the same for all affiliations
		my $aut_x = ($aut->left + $aut->right) / 2;
		my $aut_y = ($aut->top + $aut->bottom) / 2;

		# First pass: the nearest affiliation on each axis, the first one wins a tie
		my ($nearest_x, $nearest_y);
		my ($best_dx, $best_dy) = (LONG_MAX, LONG_MAX);
		foreach my $name (@aff_names)
		{
			my $aff	= $aff_rc->{ $name };
			my $dx	= abs( $aut_x - ($aff->left + $aff->right) / 2 );
			my $dy	= abs( $aut_y - ($aff->top + $aff->bottom) / 2 );

			if ($dx < $best_dx) { ($best_dx, $nearest_x) = ($dx, $name); }
			if ($dy < $best_dy) { ($best_dy, $nearest_y) = ($dy, $name); }
		}

		# Second pass: one feature line per affiliation
		foreach my $name (@aff_names)
		{
			my $aff = $aff_rc->{ $name };

			# The signal is the same when one of the author signals equals
			# the first affiliation signal
			my $signal = "diff";
			if (@{ $aut->signals } && @{ $aff->signals })
			{
				my $aff_signal = $aff->signals->[ 0 ];
				if (grep { $_ eq $aff_signal } @{ $aut->signals }) { $signal = "same"; }
			}

			# Each level is only compared when all the levels above it agree
			my $same_page	= ($aut->page == $aff->page);
			my $same_sect	= $same_page && ($aut->section == $aff->section);
			my $same_para	= $same_sect && ($aut->para == $aff->para);
			my $same_line	= $same_para && ($aut->line == $aff->line);

			my @flags = ($same_page, $same_sect, $same_para, $same_line, $name eq $nearest_x, $name eq $nearest_y);

			$features .= join("\t", $aut_label . "#" . $aff_label{ $name }, $signal, map { $_ ? "yes" : "no" } @flags) . "\n";
		}
	}

	return $features;
}

# Actually do the matching between author and affiliation
#
# The feature lines are written to a temporary file with the placeholder
# label "no", crf_test is run on it with the matching model, and every output
# line labelled "yes" is turned back into an (author, affiliation) pair.
#
# Parameters
#	$features	- string built by AAFeatureExtraction, one pair per line
#
# Returns
#	hash ref: author name => array ref of the affiliation names matched to it;
#	authors without any match have no entry
#
# Dies when one of the temporary files cannot be opened or written.
sub AAMatchingImp
{
	my ($features) = @_;

	# Temporary input file for CRF
	my $infile	= BuildTmpFile("aa-input");
	# Temporary output file for CRF
	my $outfile	= BuildTmpFile("aa-output");

	# Split and write to temporary input
	open(my $output_handle, ">:utf8", $infile) or die "# Cannot open $infile for writing: $!\n";
	foreach my $line (split /\n/, $features)
	{
		# Blank lines are kept as they are, the others get the placeholder label
		print {$output_handle} $line eq "" ? "\n" : "$line\tno\n";
	}
	# Buffered write errors only show up here
	close($output_handle) or die "# Cannot write $infile: $!\n";

	# AA matching model
	my $match_model = $SectLabel::Config::matFile;

	# Matching
	if (system("$crft -m $match_model $infile > $outfile") != 0)
	{
		# Keep going: whatever was written to the output file is still read
		print STDERR "# crf_test failed on the author - affiliation matching model (status $?)", "\n";
	}

	# List of authors and their affiliation (if exists)
	my %aa = ();

	# Read the CRF output
	open(my $input_handle, "<:utf8", $outfile) or die "# Cannot open $outfile for reading: $!\n";
	# Read each line and get its label
	while (my $line = <$input_handle>)
	{
		# Trim
		$line =~ s/^\s+|\s+$//g;
		# Skip blank lines
		if ($line eq "") { next; }

		# The content is the first column and the class is the last one
		my @fields = split /\t/, $line;
		my ($content, $class) = @fields[ 0, -1 ];

		# Not a match
		if ($class ne "yes") { next; }

		# Split the content into author name and affiliation name, only the
		# first '#' separates them, so a '#' inside the affiliation survives
		my ($author, $aff) = split /#/, $content, 2;
		# Not an author#affiliation pair
		if (! defined $aff) { next; }

		# Restore the spaces
		$author	=~ s/\|\|\|/ /g;
		$aff	=~ s/\|\|\|/ /g;

		# Save, the list is created on the first match of the author
		push @{ $aa{ $author } }, $aff;
	}
	
	# Done
	close $input_handle;
	
	# Clean up
	unlink $infile;
	unlink $outfile;
	# Done
	return (\%aa);
}

# Extract affiliation and their signal using crf
#
# The feature lines are written to a temporary file with the placeholder
# label "affiliation", crf_test is run on it with the affiliation model and
# its output is read back token by token. Consecutive tokens of the same
# class form one affiliation or one signal; a blank line ends a section.
#
# Parameters
#	$features		- CRF features, one token per line, blank line between sections
#	$rc_features	- relational features, its lines are read with the same line
#					  index as the CRF output
#
# Returns (\%asg, \%aaf, \@aff)
#	%asg	- signal => affiliation name, only for non-empty signals
#	%aaf	- affiliation name => aff_rcfeatures record built from its first token
#	@aff	- all affiliation names in the order they are found
#
# Dies when one of the temporary files cannot be opened or written.
sub AffiliationExtraction
{
	my ($features, $rc_features) = @_;

	# Temporary input file for CRF
	my $infile	= BuildTmpFile("aff-input");
	# Temporary output file for CRF
	my $outfile	= BuildTmpFile("aff-output");

	# Split and write to temporary input
	open(my $output_handle, ">:utf8", $infile) or die "# Cannot open $infile for writing: $!\n";
	foreach my $line (split /\n/, $features)
	{
		# Blank lines are kept as they are, the others get the placeholder label
		print {$output_handle} $line eq "" ? "\n" : "$line\taffiliation\n";
	}
	# Buffered write errors only show up here
	close($output_handle) or die "# Cannot write $infile: $!\n";

	# Affiliation model
	my $aff_model = $SectLabel::Config::affFile;

	# Label the tokens
	if (system("$crft -m $aff_model $infile > $outfile") != 0)
	{
		# Keep going: whatever was written to the output file is still read
		print STDERR "# crf_test failed on the affiliation model (status $?)", "\n";
	}

	# Each affiliation can have only one signal
	my %asg = ();
	# Each affiliation can have only one struct
	my %aaf	= ();
	# List of all affiliations
	my @aff = ();

	# Each line in the relational features string
	my @rc_lines = split /\n/, $rc_features;

	# Class of the tokens being collected
	my $prev_class	= "";
	# Tokens of the current affiliation and their relational features
	my @aff_str		= ();
	my @aaf_rc		= ();
	# Tokens of the current signal
	my $signal_str	= "";
	# Line counter, index into @rc_lines
	my $counter		= 0;
	# Next to last signal
	my $ntl_signal	= "";

	# Close the run of tokens collected so far: an affiliation is saved
	# together with the next to last signal, a signal becomes the next to
	# last signal. The token buffers are emptied in both cases.
	my $flush = sub
	{
		if ($prev_class eq "affiliation")
		{
			my ($affiliation, $rcs) = NormalizeAffiliationName(\@aff_str, \@aaf_rc);
			# Save the affiliation
			push @aff, $affiliation;
			# and its signal
			if ($ntl_signal ne "") { $asg{ $ntl_signal } = $affiliation; }

			# Save the signal
			# NOTE(review): an empty signal is stored as well, so the record
			# always holds exactly one signal -- confirm that this is intended
			push @{ $rcs->signals }, $ntl_signal;
			# Save the record
			$aaf{ $affiliation } = $rcs;
		}
		elsif ($prev_class eq "signal")
		{
			# Save the next to last signal
			$ntl_signal = NormalizeAffiliationSignal($signal_str);
		}

		# Cleanup
		@aff_str	= ();
		@aaf_rc		= ();
		$signal_str	= "";
	};

	# Read the CRF output
	open(my $input_handle, "<:utf8", $outfile) or die "# Cannot open $outfile for reading: $!\n";
	# Read each line and get its label
	# TODO: The code assumes that an affiliation will have the following format: 1 foobar institute
	while (my $line = <$input_handle>)
	{
		# Trim
		$line =~ s/^\s+|\s+$//g;

		# Blank line mark the end of an affiliation section
		if ($line eq "")
		{
			$flush->();
			# A signal does not cross a section boundary
			$ntl_signal = "";
			$prev_class = "";

			# Update the counter
			$counter++;
			next;
		}

		# The content is the first column and the class is the last one
		my @fields = split /\t/, $line;
		my ($content, $class) = @fields[ 0, -1 ];

		# A change of class ends the current run
		if ($class ne $prev_class)
		{
			$flush->();
			# Switch to the current class
			$prev_class = $class;
		}

		# An affiliation
		if ($class eq "affiliation")
		{
			push @aff_str, $content;
			push @aaf_rc, $rc_lines[ $counter ];
		}
		# A signal
		elsif ($class eq "signal")
		{
			$signal_str .= $content . " ";
		}

		# Update the counter
		$counter++;
	}

	# Final class
	$flush->();

	# Done
	close $input_handle;
	
	# Clean up
	unlink $infile;
	unlink $outfile;
	# Done
	return (\%asg, \%aaf, \@aff);
}

# Normalize an affiliation signal: every whitespace character is removed
#
# Parameters
#	$signal_str	- the signal tokens as one string
#
# Returns
#	the signal without any whitespace
sub NormalizeAffiliationSignal
{
	my ($signal_str) = @_;

	# Work on a copy; removing all the whitespace also removes the leading
	# and the trailing one
	(my $compact = $signal_str) =~ s/\s+//g;

	# Done
	return $compact;
}

# Build the name of an affiliation and its relational features record
#
# Parameters
#	$aff_str	- array ref, the tokens of the affiliation
#	$aaf_rc		- array ref, the relational features line of each token
#
# Returns ($affiliation, $rcs)
#	$affiliation	- the tokens joined with a single space
#	$rcs			- aff_rcfeatures record filled from the line of the first
#					  token (columns 1 to 8), its signal list is empty
#
# Dies when the two arrays do not have the same size.
sub NormalizeAffiliationName
{
	my ($aff_str, $aaf_rc) = @_;

	# Constraint: one relational features line per token
	if (@{ $aff_str } != @{ $aaf_rc })
	{
		print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n";
		die;
	}

	# Only the first word gives the location of the affiliation
	my @fields = split /\s/, $aaf_rc->[ 0 ];

	# Column 0 is the content, the next eight columns are the features
	my %feature_of = ();
	@feature_of{ qw(top bottom left right page section para line) } = @fields[ 1 .. 8 ];

	my $rcs = aff_rcfeatures->new(signals => [], %feature_of);

	# Done
	return (join(' ', @{ $aff_str }), $rcs);
}

# Extract author name and their signal using crf
#
# The feature lines are written to a temporary file with the placeholder
# label "ns", crf_test is run on it with the author model and its output is
# read back token by token. Consecutive tokens of the same class form one run
# of author names or one run of signals; a blank line ends a section. A run
# of signals is given to all the authors collected since the previous signal.
#
# Parameters
#	$features		- CRF features, one token per line, blank line between sections
#	$rc_features	- relational features, its lines are read with the same line
#					  index as the CRF output
#
# Returns (\%asg, \%aas)
#	%asg	- author name => array ref of its signals (empty when it has none)
#	%aas	- author name => aut_rcfeatures record, including the same signals
#
# Dies when one of the temporary files cannot be opened or written.
sub AuthorExtraction
{
	my ($features, $rc_features) = @_;

	# Temporary input file for CRF
	my $infile	= BuildTmpFile("aut-input");
	# Temporary output file for CRF
	my $outfile	= BuildTmpFile("aut-output");

	# Split and write to temporary input
	open(my $output_handle, ">:utf8", $infile) or die "# Cannot open $infile for writing: $!\n";
	foreach my $line (split /\n/, $features)
	{
		# Blank lines are kept as they are, the others get the placeholder label
		print {$output_handle} $line eq "" ? "\n" : "$line\tns\n";
	}
	# Buffered write errors only show up here
	close($output_handle) or die "# Cannot write $infile: $!\n";

	# Author model
	my $author_model = $SectLabel::Config::autFile;

	# Split the authors
	if (system("$crft -m $author_model $infile > $outfile") != 0)
	{
		# Keep going: whatever was written to the output file is still read
		print STDERR "# crf_test failed on the author model (status $?)", "\n";
	}

	# Each author can have one or more signals
	my %asg = ();
	# Each author can have only one struct
	my %aas = ();

	# Each line in the relational features string
	my @rc_lines = split /\n/, $rc_features;

	# Class of the tokens being collected
	my $prev_class	= "";
	# Tokens of the current run of authors and their relational features
	my @author_str	= ();
	my @author_rc	= ();
	# Tokens of the current run of signals
	my $signal_str	= "";
	# Line counter, index into @rc_lines
	my $counter		= 0;
	# Next to last authors, they receive the next run of signals
	my %ntl_asg 	= ();
	# Set while authors are collected and no signal has been seen since
	my $is_authors	= 0;

	# Close the run of tokens collected so far: authors are saved and wait
	# for their signals, signals are given to all the waiting authors.
	# The token buffers are emptied in both cases.
	my $flush = sub
	{
		if ($prev_class eq "author")
		{
			my ($authors, $rcs) = NormalizeAuthorNames(\@author_str, \@author_rc);
			# Save each author
			foreach my $i (0 .. $#{ $authors })
			{
				my $name = $authors->[ $i ];

				# Start with an empty list of signals
				$asg{ $name } 		= [];
				$aas{ $name }		= $rcs->[ $i ];
				$ntl_asg{ $name }	= 0;
			}
		}
		elsif ($prev_class eq "signal")
		{
			my $signals = NormalizeAuthorSignal($signal_str);
			# Save each signal to its corresponding author
			foreach my $author (keys %ntl_asg)
			{
				foreach my $signal (@{ $signals })
				{
					push @{ $asg{ $author } }, $signal;
					push @{ $aas{ $author }->signals }, $signal;
				}
			}
		}

		# Cleanup
		@author_str = ();
		@author_rc	= ();
		$signal_str = "";
	};

	# Read the CRF output
	open(my $input_handle, "<:utf8", $outfile) or die "# Cannot open $outfile for reading: $!\n";
	# Read each line and get its label
	while (my $line = <$input_handle>)
	{
		# Trim
		$line =~ s/^\s+|\s+$//g;

		# Blank line mark the end of an author section
		if ($line eq "")
		{
			$flush->();
			# Nothing crosses a section boundary
			%ntl_asg	= ();
			$prev_class = "";
			$is_authors = 0;

			# Update the counter
			$counter++;
			next;
		}

		# The content is the first column and the class is the last one
		my @fields = split /\t/, $line;
		my ($content, $class) = @fields[ 0, -1 ];

		# A change of class ends the current run
		if ($class ne $prev_class)
		{
			$flush->();

			# Clean the next to last author list if this current class is author
			# and the authors before it already got their signals
			if (($is_authors == 0) && ($class eq "author")) { %ntl_asg = (); $is_authors = 1; }
			# The authors after a signal start a new list
			if ($class eq "signal") { $is_authors = 0; }

			# Switch to the current class
			$prev_class = $class;
		}

		# An author
		if ($class eq "author")
		{
			push @author_str, $content;
			push @author_rc, $rc_lines[ $counter ];
		}
		# A signal
		elsif ($class eq "signal")
		{
			$signal_str .= $content . " ";
		}

		# Update the counter
		$counter++;
	}

	# Final class
	$flush->();

	# Done
	close $input_handle;
	
	# Clean up
	unlink $infile;
	unlink $outfile;
	# Done
	return (\%asg, \%aas);
}

# Split a run of author tokens into individual author names
#
# A name ends at a separator token (&, and, a comma or a semicolon on its
# own, case insensitive), at a token that ends with a comma unless it is the
# first token of the name, or at the end of the run.
#
# Parameters
#	$author_str	- array ref, the tokens of the run
#	$author_rc	- array ref, the relational features line of each token
#
# Returns (\@authors, \@rcs)
#	@authors	- the names, each one normalized with ParsCit::PostProcess::NormalizeAuthorName
#	@rcs		- one aut_rcfeatures record per name, filled from the line of
#				  the first token of the name (columns 1 to 8), its signal list is empty
#
# Dies when the two arrays do not have the same size.
sub NormalizeAuthorNames
{
	my ($author_str, $author_rc) = @_;

	# Constraint: one relational features line per token
	if (scalar(@{ $author_str }) != scalar(@{ $author_rc }))
	{
		print STDERR "# It cannot happen, if you encounter it, please consider report it as a bug", "\n";
		die;
	}

	my @authors	= ();
	my @rcs		= ();

	# Tokens of the name being collected, it is empty between two names
	my @name_tokens	= ();
	# Index of the first token of that name
	my $first_index	= 0;

	# Save the name being collected (if there is one) with the relational
	# features of its first word, then get ready for the next name
	my $finish_name = sub
	{
		return if (! @name_tokens);

		push @authors, ParsCit::PostProcess::NormalizeAuthorName(@name_tokens);

		my @fields = split /\s/, $author_rc->[ $first_index ];
		# Create and save the new record
		push @rcs, aut_rcfeatures->new(	signals => [],
										top => $fields[ 1 ], bottom => $fields[ 2 ], left => $fields[ 3 ], right => $fields[ 4 ],
										page => $fields[ 5 ], section => $fields[ 6 ], para => $fields[ 7 ], line => $fields[ 8 ]	);

		@name_tokens = ();
	};

	# Check all tokens in the author string
	foreach my $i (0 .. $#{ $author_str })
	{
		my $token = $author_str->[ $i ];

		# A separator ends the current name and is dropped
		if ($token =~ m/^(&|and|,|;)$/i)
		{
			$finish_name->();
			next;
		}

		# The first token of a name is always kept, whatever its ending
		if (! @name_tokens)
		{
			$first_index = $i;
			push @name_tokens, $token;
			next;
		}

		# Any other token belongs to the name
		push @name_tokens, $token;
		# and it also ends the name when it ends with a comma
		if ($token =~ m/,$/) { $finish_name->(); }
	}

	# Last author name
	$finish_name->();

	# Done
	return (\@authors, \@rcs);
}

# 
# Split a run of author signals into individual signals
#
# Parameters
#	$signal_str	- the signal tokens as one string
#
# Returns
#	array ref of the signals: the trimmed string is cut at every single
#	space, comma, colon or semicolon, so two separators in a row give an
#	empty signal between them
sub NormalizeAuthorSignal
{
	my ($signal_str) = @_;

	# Trim
	$signal_str =~ s/^\s+|\s+$//g;

	# Every separator character is a cut of its own
	return [ split /[ ,:;]/, $signal_str ];
}

# Extract features from affiliation lines
# The list of features include
# Content
# Content, lower case, no punctuation
# Content length
# First word in line
#
# XML features
# Subscript, superscript
# Bold
# Italic
# Underline
# Relative font size
# Differentiate features
sub AffiliationFeatureExtraction
{
	my ($aff_lines, $aff_addrs) = @_;

	# NOTE: Relational classifier features
	my $rc_features		= "";

	# Features will be stored here
	my $features 		= "";
	# First word in line
	my $is_first_line	= undef;

	# Font size
	my %fonts = ();
	# Each line contains many runs
	foreach my $line (@{ $aff_lines })
	{
		my $runs = $line->get_objs_ref();
		# Iterator though all work in all lines
		foreach my $run (@{ $runs })
		{
			my $fsize = $run->get_font_size();
			my $words = $run->get_objs_ref();

			# Statistic
			if (! exists $fonts{ $fsize })
			{
				$fonts{ $fsize } = scalar(@{ $words });
			}
			else
			{
				$fonts{ $fsize } += scalar(@{ $words });
			}
		}
	}

	my $dominate_font = undef;
	# Sort all the font descend with the number of their appearance
	my @sorted = sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
	# Select the dominated font
	$dominate_font = $sorted[ 0 ];

	my $size_mismatch = undef;
	# TODO: serious error if the size of aff_lines and the size of aff_addrs mismatch
	if (scalar(@{ $aff_lines }) != scalar(@{ $aff_addrs })) 
	{ 	
		$size_mismatch = 1;
		# Print the error but still try to continue
		print STDERR "# Total number of affiliation lines (" . scalar(@{ $aff_lines }) . ") != Total number of affiliation addresses (" . scalar(@{ $aff_addrs }) . ")." . "\n";
	}

	my $prev_page = undef;
	my $prev_sect = undef;
	my $prev_para = undef;
	# Each line contains many runs
	for (my $counter = 0; $counter < scalar(@{ $aff_lines }); $counter++)
	{
		# Get the line object
		my $line = $aff_lines->[ $counter ];

		# Check the size of aff_lines and aff_addrs
		if (! defined $size_mismatch)
		{
			# Check if two consecutive lines are from two different sections 
			if (! defined $prev_page)
			{
				# Init
				$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
				$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
				$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
			}
			else
			{
				# Affiliations from different sections will be separated immediately
				if (($prev_page != $aff_addrs->[ $counter ]->{ 'L1' }) ||
					($prev_sect != $aff_addrs->[ $counter ]->{ 'L2' }) ||
					($prev_para != $aff_addrs->[ $counter ]->{ 'L3' })) 
				{ 
					$features .= "\n"; 
				
					# NOTE: Relational classifier features
					$rc_features .= "\n";
				}

				# Save the paragraph index
				$prev_page = $aff_addrs->[ $counter ]->{ 'L1' };
				$prev_sect = $aff_addrs->[ $counter ]->{ 'L2' };
				$prev_para = $aff_addrs->[ $counter ]->{ 'L3' };
			}
		}

		# Set first word in line
		$is_first_line = 1;

		# Two previous words
		my $prev_word		= undef;
		my $prev_prev_word	= undef;

		# Format of the previous word
		my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = "unknown";

		my $runs = $line->get_objs_ref();
		# Iterator though all work in all lines
		foreach my $run (@{ $runs })
		{
			# The run must be non-empty
			my $tmp = $run->get_content();
			# Trim
			$tmp	=~ s/^\s+|\s+$//g;
			# Skip blank run
			if ($tmp eq "") { next; }

			###
			# The following features are XML features
			###
			
			# Bold format	
			my $bold = ($run->get_bold() eq "true") ? "bold" : "none";
			
			# Italic format	
			my $italic = ($run->get_italic() eq "true") ? "italic" : "none";

			# Underline
			my $underline = ($run->get_underline() eq "true") ? "underline" : "none";

			# Sub-Sup-script
			my $suscript =	($run->get_suscript() eq "superscript")	? "super"	:
							($run->get_suscript() eq "subscript")	? "sub"		: "none";

			# Relative font size
			my $fontsize =	($run->get_font_size() > $dominate_font)	? "large"	:
							($run->get_font_size() < $dominate_font)	? "small"	: "normal";
			
			###
			# End of XML features
			###

			# All words in the run
			my $words = $run->get_objs_ref();

			# For each word
			foreach my $word (@{ $words })
			{
				# Get word location
				my $top 	= $word->get_top_pos();
				my $bottom 	= $word->get_bottom_pos();
				my $left	= $word->get_left_pos();
				my $right	= $word->get_right_pos();

				# NOTE: heuristic rule, for words in the same line
				# If the x-axis distance between this word and the previous word is
				# three times larger than the distance between the previous word and
				# the word before it, then it marks the separator.
				# The better way to do this is to introduce it as a new feature in the
				# author and affiliation model but this step requires re-training these
				# two models, so ...
				#
				# NOTE: Assuming left to right writing
				if (! defined $prev_word)
				{
					$prev_word = $word;	
				}
				elsif (! defined $prev_prev_word)
				{
					# NOTE: Words have the power to both destroy and heal, when words are both
					# true and kind, they can change our world
					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
					{
						$prev_prev_word = $prev_word;
						$prev_word		= $word;
					}
				}
				else
				{
					# NOTE: Words have the power to both destroy and heal, when words are both
					# true and kind, they can change our world
					if (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
					{
						my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
						my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());

						if ($prev_dist * 5 < $curr_dist)
						{
							$features .= "\n"; 
				
							# NOTE: Relational classifier features
							$rc_features .= "\n";
						}

						$prev_prev_word = $prev_word;
						$prev_word		= $word;
					}
				}

				# Extract features
				my $full_content = $word->get_content();
				# Trim
				$full_content	 =~ s/^\s+|\s+$//g;

				# Skip blank run
				if ($full_content eq "") { next; }

				my @sub_content = ();
				# This is the tricky part, one word e.g. **affiliation will be 
				# splitted into two parts: the signal, and the affiliation if 
				# possible using regular expression
				while ($full_content =~ m/([\w|-]*)(\W*)/g)
				{
					my $first	= $1;
					my $second	= $2;
						
					# Trim
					$first	=~ s/^\s+|\s+$//g;
					$second	=~ s/^\s+|\s+$//g;
				
					# Only keep non-blank content
					if ($first ne "") { push @sub_content, $first; }

					# Check the signal and separator
					while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
					{
						my $sub_first	= $1;
						my $sub_second	= $2;

						# Trim
						$sub_first	=~ s/^\s+|\s+$//g;
						$sub_second	=~ s/^\s+|\s+$//g;
						
						# Only keep non-blank separator
						if ($sub_first ne "") { push @sub_content, $sub_first; }
						# Only keep non-blank signal
						if ($sub_second ne "") { push @sub_content, $sub_second; }
					}
				}

				foreach my $content (@sub_content)
				{
					# Content
					$features .= $content . "\t";
			
					my $content_n	= $content;
					# Remove punctuation
					$content_n		=~ s/[^\w]//g;
					# Lower case
					my $content_l	= lc($content);
					# Lower case, no punctuation
					my $content_nl	= lc($content_n);
					# Lower case
					$features .= $content_l . "\t";
					# Lower case, no punctuation
					if ($content_nl ne "")
					{
						$features .= $content_nl . "\t";
					}
					else
					{
						$features .= $content_l . "\t";
					}

					# Split into character
		      		my @chars = split(//, $content);
					# Content length
					my $length =	(scalar(@chars) == 1)	? "1-char"	:
									(scalar(@chars) == 2)	? "2-char"	:
									(scalar(@chars) == 3)	? "3-char"	: "4+char";
					$features .= $length . "\t";
							
					# First word in line
					if ($is_first_line == 1)
					{
						$features .= "begin" . "\t";
		
						# Next words are not the first in line anymore
						$is_first_line = 0;
					}
					else	
					{
						$features .= "continue" . "\t";
					}		

					###
					# The following features are XML features
					###
			
					# Bold format	
					$features .= $bold . "\t";
			
					# Italic format	
					$features .= $italic . "\t";

					# Underline
					$features .= $underline . "\t";

					# Sub-Sup-script
					$features .= $suscript . "\t";

					# Relative font size
					$features .= $fontsize . "\t";

					# First word in run
					if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
					{
						$features .= "fbegin" . "\t";
					}
					else	
					{
						$features .= "fcontinue" . "\t";
					}

					# New token
					$features .= "\n";

					# Save the XML format
					$prev_bold		= $bold;
					$prev_italic	= $italic;
					$prev_underline	= $underline;
					$prev_suscript	= $suscript;
					$prev_fontsize	= $fontsize;

					# NOTE: Relational classifier features
					# Content
					$rc_features .= $content . "\t";
					# Location
					$rc_features .= $top 	. "\t";
					$rc_features .= $bottom . "\t";
					$rc_features .= $left 	. "\t";
					$rc_features .= $right	. "\t";
					# Index
					if (! defined $size_mismatch)
					{
						$rc_features .= $aff_addrs->[ $counter ]->{ 'L1' } . "\t";
						$rc_features .= $aff_addrs->[ $counter ]->{ 'L2' } . "\t";
						$rc_features .= $aff_addrs->[ $counter ]->{ 'L3' } . "\t";
						$rc_features .= $aff_addrs->[ $counter ]->{ 'L4' } . "\t";
					}
					# Done
					$rc_features .= "\n";
				}
			}			
		}
	}

	return ($features, $rc_features);

}

# Extract features from author lines
#
# Parameters:
#	$aut_lines	reference to an array of line objects. A line returns its runs
#				through get_objs_ref(), and a run returns its words the same way
#	$aut_addrs	reference to an array of hashes with the keys L1, L2, L3 and L4,
#				one hash per line. L1, L2 and L3 are used as the page, section
#				and paragraph index (L4 is presumably the line index - TODO confirm)
#
# Returns the list ($features, $rc_features). Both are strings holding one
# token per line with tab-terminated columns; an empty line separates two
# groups of tokens.
#
# Columns of $features, in order:
#	Content
#	Content, lower case
#	Content, lower case, no punctuation (lower case content if nothing is left)
#	Capitalization
#	Numeric property
#	Last punctuation
#	Content length
#	First 1-gram to 4-gram
#	Last 1-gram to 4-gram
#	Dictionary: male, female, last, month, place, publisher
#	First word in line
#	XML features: bold, italic, underline, sub/superscript, relative font size
#	First word in run (differentiate feature)
#
# Columns of $rc_features, in order:
#	Content
#	Top, bottom, left, right position of the word
#	L1, L2, L3, L4 of the line (only when $aut_lines and $aut_addrs have the same size)
sub AuthorFeatureExtraction
{
	my ($aut_lines, $aut_addrs) = @_;

	# NOTE: Relational classifier features
	my $rc_features	= "";
	# Features will be stored here
	my $features	= "";

	# Number of words written in each font size
	my %fonts = ();
	foreach my $line (@{ $aut_lines })
	{
		foreach my $run (@{ $line->get_objs_ref() })
		{
			my $fsize = $run->get_font_size();
			my $words = $run->get_objs_ref();

			$fonts{ $fsize } += scalar(@{ $words });
		}
	}

	# Sort all the fonts in descending order of their number of words,
	# the dominant font is the most frequent one
	my @sorted			= sort { $fonts{ $b } <=> $fonts{ $a } } keys %fonts;
	my $dominate_font	= $sorted[ 0 ];

	my $size_mismatch = undef;
	# TODO: serious error if the size of aut_lines and the size of aut_addrs mismatch
	if (scalar(@{ $aut_lines }) != scalar(@{ $aut_addrs }))
	{
		$size_mismatch = 1;
		# Print the error but still try to continue
		print STDERR "# Total number of author lines (" . scalar(@{ $aut_lines }) . ") != Total number of author addresses (" . scalar(@{ $aut_addrs }) . ")." . "\n";
	}

	# Dictionary classes from the highest value to the lowest, the stored
	# dictionary value is the sum of the classes a word belongs to
	my @dict_classes = ([ 32, "publisher" ], [ 16, "place" ], [ 8, "month" ], [ 4, "last" ], [ 2, "female" ], [ 1, "male" ]);

	my $prev_page = undef;
	my $prev_sect = undef;
	my $prev_para = undef;
	# Each line contains many runs
	foreach my $counter (0 .. $#{ $aut_lines })
	{
		# Get the line object
		my $line = $aut_lines->[ $counter ];

		# The addresses are only usable if every line has its own address
		if (! defined $size_mismatch)
		{
			# Authors from different paragraphs (or sections, or pages) will be
			# separated immediately, nothing to compare with on the first line
			if ((defined $prev_page) &&
				(($prev_page != $aut_addrs->[ $counter ]->{ 'L1' }) ||
				 ($prev_sect != $aut_addrs->[ $counter ]->{ 'L2' }) ||
				 ($prev_para != $aut_addrs->[ $counter ]->{ 'L3' })))
			{
				$features .= "\n";

				# NOTE: Relational classifier features
				$rc_features .= "\n";
			}

			# Save the paragraph index
			$prev_page = $aut_addrs->[ $counter ]->{ 'L1' };
			$prev_sect = $aut_addrs->[ $counter ]->{ 'L2' };
			$prev_para = $aut_addrs->[ $counter ]->{ 'L3' };
		}

		# The next token is the first one in its line
		my $is_first_line = 1;

		# Previous word and the word before this
		my $prev_prev_word	= undef;
		my $prev_word		= undef;

		# Format of the previous token. Every variable needs its own initial
		# value: a single scalar on the right-hand side would only reach the
		# first variable of the list.
		my ($prev_bold, $prev_italic, $prev_underline, $prev_suscript, $prev_fontsize) = ("unknown") x 5;

		# Iterate through all words in all runs
		foreach my $run (@{ $line->get_objs_ref() })
		{
			# The run must be non-empty
			my $tmp = $run->get_content();
			# Trim
			$tmp	=~ s/^\s+|\s+$//g;
			# Skip blank run
			if ($tmp eq "") { next; }

			###
			# The following features are XML features
			###

			# Bold format
			my $bold = ($run->get_bold() eq "true") ? "bold" : "none";

			# Italic format
			my $italic = ($run->get_italic() eq "true") ? "italic" : "none";

			# Underline
			my $underline = ($run->get_underline() eq "true") ? "underline" : "none";

			# Sub-Sup-script
			my $suscript =	($run->get_suscript() eq "superscript")	? "super"	:
							($run->get_suscript() eq "subscript")	? "sub"		: "none";

			# Relative font size
			my $fontsize =	($run->get_font_size() > $dominate_font)	? "large"	:
							($run->get_font_size() < $dominate_font)	? "small"	: "normal";

			###
			# End of XML features
			###

			# For each word in the run
			foreach my $word (@{ $run->get_objs_ref() })
			{
				# Get word location
				my $top 	= $word->get_top_pos();
				my $bottom 	= $word->get_bottom_pos();
				my $left	= $word->get_left_pos();
				my $right	= $word->get_right_pos();

				# NOTE: heuristic rule, for words in the same line
				# If the x-axis distance between this word and the previous word is
				# more than five times the distance between the previous word and
				# the word before it, then it marks the separator.
				# The better way to do this is to introduce it as a new feature in the
				# author and affiliation model but this step requires re-training these
				# two models, so ...
				#
				# NOTE: Assuming left to right writing
				if (! defined $prev_word)
				{
					$prev_word = $word;
				}
				# A word which shares its left or its right position with the
				# previous word is ignored by the heuristic
				elsif (($prev_word->get_left_pos() != $word->get_left_pos()) && ($prev_word->get_right_pos() != $word->get_right_pos()))
				{
					# Two previous words are needed to compare the gaps
					if (defined $prev_prev_word)
					{
						my $prev_dist = abs ($prev_word->get_left_pos() - $prev_prev_word->get_right_pos());
						my $curr_dist = abs ($word->get_left_pos() - $prev_word->get_right_pos());

						if ($prev_dist * 5 < $curr_dist)
						{
							$features .= "\n";

							# NOTE: Relational classifier features
							$rc_features .= "\n";
						}
					}

					$prev_prev_word = $prev_word;
					$prev_word		= $word;
				}

				# Extract features
				my $full_content = $word->get_content();
				# Trim
				$full_content	 =~ s/^\s+|\s+$//g;

				# Skip blank word
				if ($full_content eq "") { next; }

				my @sub_content = ();
				# This is the tricky part, one word e.g. name** will be split
				# into several parts: the name, the signal, and the separator if
				# possible using regular expression
				while ($full_content =~ m/([\w|-]*)(\W*)/g)
				{
					my $first	= $1;
					my $second	= $2;

					# Trim
					$first	=~ s/^\s+|\s+$//g;
					$second	=~ s/^\s+|\s+$//g;

					# Only keep non-blank content
					if ($first ne "") { push @sub_content, $first; }

					# Check the signal and separator
					while ($second =~ m/([,|\.|:|;]*)([^,\.:;]*)/g)
					{
						my $sub_first	= $1;
						my $sub_second	= $2;

						# Trim
						$sub_first	=~ s/^\s+|\s+$//g;
						$sub_second	=~ s/^\s+|\s+$//g;

						# Only keep non-blank separator
						if ($sub_first ne "") { push @sub_content, $sub_first; }
						# Only keep non-blank signal
						if ($sub_second ne "") { push @sub_content, $sub_second; }
					}
				}

				# Every element of @sub_content is non-empty
				foreach my $content (@sub_content)
				{
					# Content
					$features .= $content . "\t";

					my $content_n	= $content;
					# Remove punctuation
					$content_n		=~ s/[^\w]//g;
					# Lower case
					my $content_l	= lc($content);
					# Lower case, no punctuation
					my $content_nl	= lc($content_n);
					# Lower case
					$features .= $content_l . "\t";
					# Lower case, no punctuation, or lower case only if the token
					# is made of punctuation
					$features .= (($content_nl ne "") ? $content_nl : $content_l) . "\t";

					# Capitalization
					my $ortho = ($content =~ /^[\p{IsUpper}]$/)					? "single"	:
								($content =~ /^[\p{IsUpper}][\p{IsLower}]+/)	? "init" 	:
								($content =~ /^[\p{IsUpper}]+$/) 				? "all" 	: "others";
					$features .= $ortho . "\t";

					# Numeric property
					my $num =	($content =~ /^[0-9]$/)					? "1dig" 	:
								($content =~ /^[0-9][0-9]$/) 			? "2dig" 	:
								($content =~ /^[0-9][0-9][0-9]$/) 		? "3dig" 	:
								($content =~ /^[0-9]+$/) 				? "4+dig" 	:
								($content =~ /^[0-9]+(th|st|nd|rd)$/)	? "ordinal"	:
								($content =~ /[0-9]/) 					? "hasdig" 	: "nonnum";
					$features .= $num . "\t";

					# Last punctuation
					my $punct = ($content =~ /^[\"\'\`]/) 						? "leadq" 	:
								($content =~ /[\"\'\`][^s]?$/) 					? "endq" 	:
								($content =~ /\-.*\-/) 							? "multi"	:
								($content =~ /[\-\,\:\;]$/) 					? "cont" 	:
								($content =~ /[\!\?\.\"\']$/) 					? "stop" 	:
								($content =~ /^[\(\[\{\<].+[\)\]\}\>].?$/)		? "braces" 	: "others";
					$features .= $punct . "\t";

					# Content length
					my $clen	= length($content);
					my $length	=	($clen == 1)	? "1-char"	:
									($clen == 2)	? "2-char"	:
									($clen == 3)	? "3-char"	: "4+char";
					$features .= $length . "\t";

					# First n-gram
					$features .= substr($content, 0, 1) . "\t";
					foreach my $n (2 .. 4)
					{
						# The longest available prefix is used for short tokens,
						# except for one-character tokens which repeat the length
						$features .= (($clen >= 2) ? substr($content, 0, $n) : $length) . "\t";
					}

					# Last n-gram
					$features .= substr($content, -1) . "\t";
					foreach my $n (2 .. 4)
					{
						# The whole token is used when it is shorter than the n-gram
						$features .= (($clen >= $n) ? substr($content, -$n) : $content) . "\t";
					}

					# Dictionary
					my $dict_status = (defined $dict{ $content_nl }) ? $dict{ $content_nl } : 0;
					# Check all classes, from the highest value to the lowest
					my %dict_flags = ();
					foreach my $class (@dict_classes)
					{
						my ($value, $label) = @{ $class };

						if ($dict_status >= $value)
						{
							$dict_status -= $value;
							$dict_flags{ $label } = $label;
						}
						else
						{
							$dict_flags{ $label } = "no";
						}
					}
					# Save the features, from the lowest value to the highest
					foreach my $label ("male", "female", "last", "month", "place", "publisher")
					{
						$features .= $dict_flags{ $label } . "\t";
					}

					# First word in line
					if ($is_first_line == 1)
					{
						$features .= "begin" . "\t";

						# Next words are not the first in line anymore
						$is_first_line = 0;
					}
					else
					{
						$features .= "continue" . "\t";
					}

					###
					# The following features are XML features
					###

					# Bold format
					$features .= $bold . "\t";

					# Italic format
					$features .= $italic . "\t";

					# Underline
					$features .= $underline . "\t";

					# Sub-Sup-script
					$features .= $suscript . "\t";

					# Relative font size
					$features .= $fontsize . "\t";

					# First word in run, i.e. the format differs from the previous token
					if (($prev_bold ne $bold) || ($prev_italic ne $italic) || ($prev_underline ne $underline) || ($prev_suscript ne $suscript) || ($prev_fontsize ne $fontsize))
					{
						$features .= "fbegin" . "\t";
					}
					else
					{
						$features .= "fcontinue" . "\t";
					}

					# New token
					$features .= "\n";

					# Save the XML format
					$prev_bold		= $bold;
					$prev_italic	= $italic;
					$prev_underline	= $underline;
					$prev_suscript	= $suscript;
					$prev_fontsize	= $fontsize;

					# NOTE: Relational classifier features
					# Content
					$rc_features .= $content . "\t";
					# Location
					$rc_features .= $top 	. "\t";
					$rc_features .= $bottom . "\t";
					$rc_features .= $left 	. "\t";
					$rc_features .= $right	. "\t";
					# Index
					if (! defined $size_mismatch)
					{
						$rc_features .= $aut_addrs->[ $counter ]->{ 'L1' } . "\t";
						$rc_features .= $aut_addrs->[ $counter ]->{ 'L2' } . "\t";
						$rc_features .= $aut_addrs->[ $counter ]->{ 'L3' } . "\t";
						$rc_features .= $aut_addrs->[ $counter ]->{ 'L4' } . "\t";
					}
					# Done
					$rc_features .= "\n";
				}
			}
		}
	}

	return ($features, $rc_features);
}

# Load the name dictionary into the file-level %dict
#
# Parameters:
#	$dictfile	path of the dictionary file, relative or absolute
#
# A line starting with "## Male", "## Female", "## Last", "## Chinese",
# "## Months", "## Place" or "## Publisher" selects the class of the entries
# that follow; any other line starting with "#" is skipped. Every remaining
# line is an entry: its key is the text before the first tab (the whole line
# if there is no tab), whatever follows the tab is ignored.
#
# The value stored for a key is the combination of the classes it was seen in:
# 1 male, 2 female, 4 last (also used for "## Chinese"), 8 month, 16 place,
# 32 publisher.
#
# NOTE: a blank line is not skipped, it is stored under the empty key.
#
# Dies if the file cannot be opened.
sub ReadDict 
{
	my ($dictfile) = @_;

	# File::Spec is not loaded at the top of this file
	require File::Spec;

	# Absolute path
	my $dictfile_abs = File::Spec->rel2abs($dictfile);
	# Dictionary handle
	my $dict_handle	 = undef;
	open ($dict_handle, "<:utf8", $dictfile_abs) || die "Could not open dict file $dictfile_abs: $!";

	my $mode = 0;
	# A lexical line variable keeps the caller's $_ untouched
	while (my $line = <$dict_handle>)
	{
		if ($line =~ /^\#\# Male/) 			{ $mode = 1; }		# male names
		elsif ($line =~ /^\#\# Female/) 	{ $mode = 2; }		# female names
		elsif ($line =~ /^\#\# Last/) 		{ $mode = 4; }		# last names
		elsif ($line =~ /^\#\# Chinese/) 	{ $mode = 4; }		# last names
		elsif ($line =~ /^\#\# Months/) 	{ $mode = 8; }		# month names
		elsif ($line =~ /^\#\# Place/) 		{ $mode = 16; }		# place names
		elsif ($line =~ /^\#\# Publisher/)	{ $mode = 32; }		# publisher names
		elsif ($line =~ /^\#/) 				{ next; }
		else
		{
			# Only remove the line terminator (LF or CRLF): the last entry of
			# a file without a final newline must keep its last character
			$line =~ s/\r?\n\z//;

			my $key = $line;
			# Has probability, keep the part before the first tab
			if ($line =~ /\t/) { ($key) = split(/\t/, $line); }

			if (! exists $dict{ $key })
			{
				# Not yet tagged
				$dict{ $key } = $mode;
			}
			else
			{
				# Already tagged (some entries may appear in the same part of the
				# lexicon more than once): OR-ing the class in leaves a value
				# which already contains it untouched
				$dict{ $key } |= $mode;
			}
		}
	}

	close ($dict_handle);
}

# Build the path of a temporary file under /tmp from a file name
#
# The name is the given file name without its dots and slashes, followed by
# the process id and the current time. No file is created here.
sub BuildTmpFile 
{
	my ($filename) = @_;

	# Drop every dot and slash so that the name has no directory part
	(my $stem = $filename) =~ tr{./}{}d;

	# The process id and the time make the name specific to this run
	my $candidate = $stem . $$ . time;

	# Untaint: only a name made of safe characters is replaced by its
	# captured copy, any other name is used as is
	my ($clean) = $candidate =~ /^([-\@\w.]+)$/;
	if (defined $clean)
	{
		$candidate = $clean;
	}

	return "/tmp/$candidate"; # Altered by Min (Thu Feb 28 13:08:59 SGT 2008)
}

1;