#
# Copyright 2007 Penn State University
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
package HeaderParse::API::Function;

use utf8;
use HeaderParse::Config::API_Config qw($Database_Dir);
use HeaderParse::API::LoadInformation;
require Exporter;
use Storable qw(nfreeze thaw);
use Data::Dumper;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
use vars qw(%dictH %nameH %monthH %affiH %addrH %conjH %prepH %postcodeH %cityH %stateH %countryH %abstractH);

@ISA = qw(Exporter); # important!!
@EXPORT =  qw(&weired_author &AddrMatch &printDict &GenTrainVecMatrix &GetBorderLine &LineFeatureRepre &LineFeatureRepre2 &OfflineFillSpace &FillSpace &SeparatePunc &hash_stopwords &hash_nickname &hash_affi_stopwords &hash_addrwords &hash_statewords &str_space_clean &dump_hash_to_file &nfreeze_hash_to_file &read_hash_from_file &thaw_hash_from_file &rand_split_samples_to2parts &rand_split_samples_toNparts &rand_split_hash_index_toNparts &ExtractBinaryNfoldSVMResult &GetNameVariations &get_university_emails &compute_std);

sub AddrMatch() {

  my $inline = shift;

  my @words = split(/\s+/, $inline);
  my $senLen = 0;
  
  # match the state and country here using one or two words
  # this step might be very time consuming
  if ($words[0] !~ /^\W+\s*$/) {
      $senLen ++; # punctuation
  }

  for my $i(1 .. $#words) {
    if ($words[$i] !~ /^\W+\s*$/) {
      $senLen ++; # punctuation
    }
    #the first letter is capitalized
    if (($words[$i-1] =~ /^[\p{IsUpper}]/) && ($words[$i] =~ /^[\p{IsUpper}]/)) {
       my $pre = lc($words[$i-1]);
      
       my $now = lc($words[$i]);
       if (exists $stateH{"$pre $now"}) { # need to check if it is correct
	 $words[$i-1] = "";
	 $words[$i] = ":state:";
       }elsif (exists $countryH{"$pre $now"}) {
	 $words[$i-1] = "";
	 $words[$i] = ":country:";
       }elsif (exists $cityH{"$pre $now"}) {
	 $words[$i-1] = "";
	 $words[$i] = ":city:";
       }
     }
   }
  
   #Broken line is because of the insufficient hard disk
  $inline = "@words"; #nice join!
  $inline =~ s/^\s+//g;
  $inline =~ s/\s+$//g;

  return($inline, $senLen);
}


sub printDict() {
  my ($TotalTrainLineCount, $dictF, %dictH) = @_;
  
  open(DictFH, ">$dictF") || die "SVMHeaderParse: could not open dictfile\: $dictF to write\n";
    # replace the old FeatureDictH with the new IDs
  foreach my $feature (sort{$dictH{$a}{ID} <=> $dictH{$b}{ID}} keys %dictH) {
    if (defined $dictH{$feature}{ID}) {
      $dictH{$feature}{mean} = sprintf("%.8f", $dictH{$feature}{mean}/$TotalTrainLineCount); 
      
      if ($dictH{$feature}{max} == 0) {
	print STDERR "$feature Yahoo1 \n";
      }
      my $ANmean = sprintf("%.8f", $dictH{$feature}{mean}/$dictH{$feature}{max});
      print DictFH "$dictH{$feature}{df} $dictH{$feature}{ID}  $feature\: max\($dictH{$feature}{max}\) BNmean\($dictH{$feature}{mean}\) ANmean\($ANmean\)\n";
    }
  }  
  close(DictFH);

  return (%dictH);
}

sub GenTrainVecMatrix() {
  my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;

   open (TrainFeatureVec, ">$TrainFeatureVec") || die "SVMHeaderParse: could not open TrainFeatureVec $TrainFeatureVec to write\n";
    if ($GenMatrix) {
      open (TrainMatrxFH, ">$TrainMatrixF") || die "SVMHeaderParse: could not open TrainMatrF\: $TrainMatrixF to write\n";
      open (TrainTagIndFH, ">$TrainTagInd") || die "SVMHeaderParse: could not open TrainTagInd\: $TrainTagInd to write\n";
    }
    my $TmpTrainLineNo = 0;
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
      foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
	$TmpTrainLineNo ++;

	#10/17 multi-class
	print TrainFeatureVec "\(";
#	print TrainTagIndFH "";
	foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
	  print TrainFeatureVec "$tmpCurState ";
	  print TrainTagIndFH "$tmpCurState ";
	}
	print TrainFeatureVec "\) ";
	print TrainTagIndFH "\n";

	# brute force; insufficient memorty
	if ($GenMatrix == 0) {
	  if ($norm) {
	    if ($center == 1) {
	      #a loop of each feature in the dictionary.
	      foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
		if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
		  $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
		}
		# norm
		my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
		#centering
		$featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
		$$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
		if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
		  print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
		}
	      }
	      print TrainFeatureVec "\n";
	    }else {
	      foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
		if (! defined ($$FeatureDictH{$feature}{ID})) {
		  next;
		}
		my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
		$$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
		if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
		  print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
		}
	      }
	      print TrainFeatureVec "\n";
	    }
	  }else { # norm = 0;
	    foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
		if (! defined ($$FeatureDictH{$feature}{ID})) {
		  next;
		}
		if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) { # must be != 0
		  print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
		}
	      }
		print TrainFeatureVec "\n";
	  }
	}else {	  
	  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
	    if (! defined ($$FeatureDictH{$feature}{ID})) {
	      next;
	    }
	    if ($norm == 1) {
	      my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
	      $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
	    }
	    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature}) {
	      print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
	      # generate the matrix file for the training samples \n";
	      print TrainMatrxFH " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
	    }
	}
	print TrainFeatureVec "\n";
      }# end if

      }# end foreach l(line)
      print TrainFeatureVec "<NEW_HEADER>\n";
      print TrainTagIndFH "<NEW_HEADER>\n";
    }#end foreach s(sample)
    close (TrainFeatureVec);
    undef (%{$TrainFeatureVecH}); # release the training vector hash
    $endTrain = 0;
    close(TrainTagIndFH);
    if ($GenMatrix) {
      close(TrainMatrxFH);
    }
}

# this is for the plaintext class -- no difference from GenTrainVecMatrix
sub GenOriginalTrainVecMatrix() {
  my ($FeatureDictH, $TrainFeatureVecH, $TrainFeatureVec, $TrainMatrixF, $TrainTagInd, $GenMatrix, $norm, $center) = @_;


   open (TrainFeatureVec, ">$TrainFeatureVec") || die "SVMHeaderParse: here1...could not open TrainFeatureVec $TrainFeatureVec to write\n";
    if ($GenMatrix) {
      open (TrainMatrxFH, ">$TrainMatrixF") || die "SVMHeaderParse: here2...could not open TrainMatrF\: $TrainMatrixF to write\n";
      open (TrainTagIndFH, ">$TrainTagInd") || die "SVMHeaderParse: here3...could not open TrainTagInd\: $TrainTagInd to write\n";
    }
    my $TmpTrainLineNo = 0;
    foreach my $s (sort {$a <=> $b} keys %{$TrainFeatureVecH}) {
      foreach my $li (sort {$a <=> $b} keys %{$$TrainFeatureVecH{$s}}) {
	$TmpTrainLineNo ++;

	#10/17 multi-class
	print TrainFeatureVec "\(";
#	print TrainTagIndFH "";
	foreach my $tmpCurState (keys %{$$TrainFeatureVecH{$s}{$li}{tag}}) {
	  print TrainFeatureVec "$tmpCurState ";
	  print TrainTagIndFH "$tmpCurState ";
	}
	print TrainFeatureVec "\) ";
	print TrainTagIndFH "\n";

	#

	if (0) {
	  print TrainFeatureVec "$$TrainFeatureVecH{$s}{$li}{tag} ";
	  print TrainTagIndFH "$$TrainFeatureVecH{$s}{$li}{tag}\n";
	}

	# brute force; insufficient memorty
	if (($GenMatrix == 0) && ($norm == 1) && ($center == 1)) {
	  #a loop of each feature in the dictionary.
	  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$FeatureDictH}) {
	    if (! exists ($$TrainFeatureVecH{$s}{$li}{content}{$feature})) {
	      $$TrainFeatureVecH{$s}{$li}{content}{$feature} = 0;
	    }
	    # norm
	    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
	    #centering
	    $featureVal -= sprintf("%.8f", $$FeatureDictH{$feature}{mean}/$$FeatureDictH{$feature}{max});
	    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
	    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
	      print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
	    }
	  }
	  print TrainFeatureVec "\n";
	}elsif (($GenMatrix == 0) && ($norm == 1)) {
	  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
	    if (! defined ($$FeatureDictH{$feature}{ID})) {
	      next;
	    }
	    my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
	    $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
	    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
	      print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
	    }
	  }
	  print TrainFeatureVec "\n";
	}elsif($GenMatrix == 1) {	  
	  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %{$$TrainFeatureVecH{$s}{$li}{content}}) {
	    if (! defined ($$FeatureDictH{$feature}{ID})) {
	      next;
	    }
	    if ($norm == 1) {
	      my $featureVal = sprintf("%.8f", $$TrainFeatureVecH{$s}{$li}{content}{$feature}/$$FeatureDictH{$feature}{max});
	      $$TrainFeatureVecH{$s}{$li}{content}{$feature} = $featureVal;
	    }
	    if ($$TrainFeatureVecH{$s}{$li}{content}{$feature} != 0) {
	      print TrainFeatureVec "$$FeatureDictH{$feature}{ID}\:$$TrainFeatureVecH{$s}{$li}{content}{$feature} ";
	      # generate the matrix file for the training samples \n";
	      print TrainMatrxFH " $TmpTrainLineNo $$FeatureDictH{$feature}{ID} $$TrainFeatureVecH{$s}{$li}{content}{$feature}\n";
	    }
	}
	print TrainFeatureVec "\n";
      }# end if

      }# end foreach l(line)
      print TrainFeatureVec "<NEW_HEADER>\n";
      print TrainTagIndFH "<NEW_HEADER>\n";
    }#end foreach s(sample)
    close (TrainFeatureVec);
    undef (%{$TrainFeatureVecH}); # release the training vector hash
    $endTrain = 0;
    close(TrainTagIndFH);
    if ($GenMatrix) {
      close(TrainMatrxFH);
    }
}


sub GetBorderLine() {
  my $InFile = shift; # this file contains the sample separator
  my %BorderLineH;
  my $LineNO = 0;

  open(INFH, "$InFile") || die "SVMHeaderParse: could not open Infile\: $InFile to read \n";
  while (my $li =<INFH>) {
    $li =~ s/^\s+//g;
    $li =~ s/\s+$//g;
    if ($li !~ /^\s*$/) {
      $LineNO++;
      if ($LineNO == 1) {
        $BorderLineH{$LineNO} = "N"; #only has next line
      }elsif ($li =~ /^\<NEW\_HEADER\>/) {
        $BorderLineH{$LineNO-1} = "P";
        $BorderLineH{$LineNO+1} = "N";
      }
    }
  }
  close(INFH);
  delete($BorderLineH{$LineNO+1}); # delete the last line

  return(\%BorderLineH);
}

#all relevant domain databases are imported as shown at the beginning of this 
#program
#useful for OfflineSeparateMultiClassLine.pl esp. for printing
sub LineFeatureRepre2() {
  my $label = shift;
  my $line = shift;
  my $FeatureDictH = shift; 
  my $FiletoPrint = shift;

  my $neutral = 1;
  my $neutralAddName = 0;
  my $norm = 1;

  my %TestFeatureVecH = (); #very important
  
  #some of these features might not work for single word case such as 
  #senLen, so might just take this factor out for word case
  #########categorical features################
  my $senLen = 0;
  my $dateNum = 0;
  my $DictWordNum = 0;
  my $NonDictWordNum = 0;
  my $Cap1DictWordNum = 0;
  my $Cap1NonDictWordNum = 0;
  my $digitNum = 0;
  my $others = 0;
  my $affiNum = 0;	
  my $addrNum = 0; # let city, state, country all counted as the addr
                   # for word case, we might need more specific recognition
  my $capNum = 0;
  my $introNum = 0;
  my $phoneNum = 0;
  my $degreeNum = 0;
  my $pubNum = 0;
  my $noteNum = 0;
  my $pageNum = 0;
  ###

  my $TokenLine;			 				 
  if (length($line) > 1) {
    ($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
    #transformed features				  
  }else {
    $TokenLine = $line;
  }
  
  my @words = split(/\s+/, $TokenLine);
  #now start the AddrNameConfu, shared among address and people's name 
  #normally do not use this representation

  for my $i(0 .. $#words) {
    if ($words[$i] =~ /\+PAGE\+/) {
      $words[$i] = ":page:";
      $pageNum++;
    }
  } # end with for each word
  
  #match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)					       
  if (($neutral) && (length($line) > 1)) {
    for my $i(1 .. $#words) {
      my $pre = lc($words[$i-1]);
      my $now = lc($words[$i]);
      my $prestem;
      my $nowstem;
      my $degreeMatch;
      my $pubnumMatch;
      my $noteMatch;
      my $affiMatch;

      if ($stem) {
	$prestem = &PSTEM::stem($pre);
	$nowstem = &PSTEM::stem($now);
	$degreeMatch = $degreeH{lc("$prestem $nowstem")};
	$pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
	$noteMatch = $noteH{lc("$prestem $nowstem")};
	$affiMatch = $affiH{lc("$prestem $nowstem")};
      }else { # for bigram match, we do not request both to be capitalized
	$degreeMatch = $degreeH{lc("$pre $now")};
	$pubnumMatch = $pubnumH{lc("$pre $now")};
	$noteMatch = $noteH{lc("$pre $now")};
	$affiMatch = $affiH{lc("$pre $now")};
      }
      
      
      if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }
      
      my %Confuse4BiGram = (
			    1 => 0,
			    2 => 0,
			    3 => 0,
			    4 => 0
			   );
      my $match = 0;
      if ($degreeMatch) {
	$Confuse4BiGram{1} = 1;
	$match = 1;
      }
      if ($pubnumMatch) {
	$Confuse4BiGram{2} = 1;
	$match = 1;
      }
      if ($noteMatch) {
	$Confuse4BiGram{3} = 1;
	$match = 1;
      }
      
      if ($affiMatch) {
	$Confuse4BiGram{4} = 1;
	$match = 1;
      }
      
      if ($match == 0) { next; }
      
      $words[$i] = "\:Confuse4BiGram";
      foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
	$words[$i] .= "$Confuse4BiGram{$ind}";
      }
      $words[$i] .= "\:";
      
      if ($words[$i] eq "\:Confuse4BiGram1000\:") {
	$words[$i-1] = "";
	$words[$i] = ":degree:";
	$degreeNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
	$words[$i-1] = "";
	$words[$i] = ":pubnum:";
	$pubNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
	$words[$i-1] = "";
	$words[$i] = ":note:";
	$noteNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
	$words[$i-1] = "";
	$words[$i] = ":affi:";
	$affiNum++;
      }
    }
  }#end with neutral bigram
  
  # single words match on Pubnum, notes and degree!
  for my $i(0 .. $#words) {
    if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
      if ($neutral) {
	my %Confuse4Single = (
			      1 => 0,
			      2 => 0,
			      3 => 0,
			      4 => 0
			     );
	my $match = 0;
	my $degreeMatch;
	my $pubnumMatch;
	my $noteMatch;
	my $affiMatch;
	my $stemword;
	
	if ($stem) {
	  $stemword = &PSTEM::stem($stemword);
	  $degreeMatch = $degreeH{$stemword};
	  $pubnumMatch = $pubnumH{$stemword};
	  $noteMatch = $noteH{$stemword};
	  $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
	}else {
	  $degreeMatch = $degreeH{lc($words[$i])};
	  $pubnumMatch = $pubnumH{lc($words[$i])};
	  $noteMatch = $noteH{lc($words[$i])};
	  $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
	}
	
	#because hhan@cse.psu.edu will become hhan.psu.edu after stemming
	#and $stemword is lower case
	if ($degreeMatch) {
	  $Confuse4Single{1} = 1;
	  $match = 1;
	}
	if ($pubnumMatch) {
	  $Confuse4Single{2} = 1;
	  $match = 1;
	}
	if ($noteMatch) {
	  $Confuse4Single{3} = 1;
	  $match = 1;
	}
	if ($affiMatch) {
	  $Confuse4Single{4} = 1;
	  $match = 1;
	}
	
	if ($match) {
	  $words[$i] = "\:Confuse4Single";
	  foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
	    $words[$i] .= "$Confuse4Single{$ind}";
	  }
	  $words[$i] .= "\:";
	  if ($words[$i] eq "\:Confuse4Single1000\:") {
	    $words[$i] = ":degree:";
	    $degreeNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
	    $words[$i] = ":pubnum:";
	    $pubNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
	    $words[$i] = ":note:";
	    $noteNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
	    $words[$i] = ":affi:";
	    $affiNum++;
	  }
	}
      }# end with neutral

      if ($words[$i] !~ /\:\w+\:/) {
	if (exists($conjH{$words[$i]})) {
	  $words[$i] = ":conj:";
	}elsif (exists($prepH{$words[$i]})) {
	  $words[$i] = ":prep:";
	}elsif ($words[$i] =~ /\@/) {
	  $words[$i] = "\:Email\:";
	}elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
	  $words[$i] = "\:http\:";
	}elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
	  if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
	    $words[$i] = ":SingleCap:"; #like M
	    $capNum ++; # actually only the number of single cap
	  }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
	    $words[$i] = ":postcode:";
	  }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
	    $words[$i] = ":abstract:";
	  }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
	    $words[$i] = ":keyword:";
	  }elsif ($introH{lc($words[$i])}) {
	    $words[$i] = ":intro:";
	    $introNum++;
	  }elsif ($phoneH{lc($words[$i])}) {
	    $words[$i] = ":phone:";
	    $phoneNum++;
	  }elsif ($monthH{lc($words[$i])}) {
	    $words[$i] = ":month:";
	    $dateNum++;
	  }else {
	    if ($neutral) {
	      if ($addrH{lc($words[$i])}) {
		$words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($cityH{lc($words[$i])}) { #If not neutral class
		$words[$i] = ":city:";
		$addrNum++;
	      }elsif ($stateH{lc($words[$i])}) {
		$words[$i] = ":state:";
		$addrNum++;
	      }elsif ($countryH{lc($words[$i])}) {
		$words[$i] = ":country:";
		$addrNum++;
	      }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
		$words[$i] = ":MayName:";
		$Cap1NonDictWordNum ++;
	      }elsif ($dictH{lc($words[$i])}) {
		$words[$i] = ":Cap1DictWord:";
		$Cap1DictWordNum ++;		
	      }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
		my @Parts = split(/\W+|\-/, $words[$i]);
		for $i(0 .. $#Parts) {
		  if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:LowerWord"."$len"."\:";
		    $Parts[$i] = "\:LowerWords\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:CapWord"."$len"."\:";
		    $Parts[$i] = "\:CapWords\:";
		  }elsif ($Parts[$i] =~ /^\d+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:Dig\[$len\]\:";
		    $Parts[$i] = "\:Digs\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
		    $Parts[$i] = "\:MixCaseWords\:";
		  }else {
		    my $len = length($Parts[$i]);
		    $Parts[$i] = "\:Mix\[$len\]\:";
		  }
		}
		$words[$i] = join("\-", @Parts);
	      }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
		my $len = length($words[$i]);
		$words[$i] = "\:CapWord"."$len"."\:";
		#	      $words[$i] = "\:CapWords\:";
	      }else {
		$words[$i] = ":Cap1NonDictWord:";
		$Cap1NonDictWordNum ++;
	      }
	    }else {
	      if ($degreeH{lc($words[$i])}) {
		$words[$i] = ":degree:";
		$degreeNum++;
	      }elsif ($pubnumH{lc($words[$i])}) {
		$words[$i] = ":pubnum:";
		$pubNum++;
	      }elsif ($noteH{lc($words[$i])}) {
		$words[$i] = ":note:";
		$noteNum++;
	      }elsif ($monthH{lc($words[$i])}) {
		$words[$i] = ":month:";
		$dateNum++;
	      }elsif ($affiH{lc($words[$i])}) {
		$words[$i] = ":affi:";
		$affiNum++;
	      }elsif ($addrH{lc($words[$i])}) {
		$words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($cityH{lc($words[$i])}) { #If not neutral class
		$words[$i] = ":city:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($stateH{lc($words[$i])}) {
		$words[$i] = ":state:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($countryH{lc($words[$i])}) {
		$words[$i] = ":country:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
		$words[$i] = ":MayName:";
		$Cap1NonDictWordNum ++;
	      }elsif ( $dictH{lc($words[$i])}) {
		$words[$i] = ":Cap1DictWord:";
		$Cap1DictWordNum ++;		
	      }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
		my @Parts = split(/\W+|\-/, $words[$i]);
		for $i(0 .. $#Parts) {
		  if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:LowerWord"."$len"."\:";
		    $Parts[$i] = "\:LowerWords\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:CapWord"."$len"."\:";
		    $Parts[$i] = "\:CapWords\:";
		  }elsif ($Parts[$i] =~ /^\d+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:Dig\[$len\]\:";
		    $Parts[$i] = "\:Digs\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
		    $Parts[$i] = "\:MixCaseWords\:";
		  }else {
		    my $len = length($Parts[$i]);
		    $Parts[$i] = "\:Mix\[$len\]\:";
		  }
		}
		$words[$i] = join("\-", @Parts);
	      }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
		my $len = length($words[$i]);
		$words[$i] = "\:CapWord"."$len"."\:";
		#	      $words[$i] = "\:CapWords\:";
	      }else {
		$words[$i] = ":Cap1NonDictWord:";
		$Cap1NonDictWordNum ++;
	      }
	    }
	  }#end with neutral
	}elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
	  if (exists ($phoneH{$words[$i]})) {
	    $words[$i] = ":phone:";
	    $phoneNum++;
          }elsif (exists ($monthH{lc($words[$i])})) {
	    $words[$i] = ":month:";
	    $dateNum++;
	  }elsif ($keywordH{lc($words[$i])}) {
	    $words[$i] = ":keyword:";
	  }elsif (exists $dictH{lc($words[$i])}) { 
	    $words[$i] = ":DictWord:";
	    $DictWordNum ++;		
	  }else {# should consider the mixure of digit and letters
	    $words[$i] = ":NonDictWord:";
	    $NonDictWordNum ++;
	  } 
	}elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
	  my $newword = $words[$i];
	  while ($words[$i] =~ /(\d+)/g) {
	    my $dig = $1;
	    my $diglen = length($dig);
	    $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
	  }
	  $words[$i] = $newword;
	  $digitNum++;
	}elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
	  my $nonword = $1;
	  my $rest = $2;
	  $words[$i] = $nonword;
	  while (length($rest) > 0) {
	    if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
	    }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:LowerWords\:".length($tmp);
	    }elsif ($rest =~ /^(\d+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:Digs\:".length($tmp);
	    }else { #get the head character
	      my $restLen = length($rest);
	      $restLen--;
	      $words[$i] .= substr($rest, 0, 1);
	      $rest = substr($rest, 1, $restLen);
	    }
	  }
	}else {
	  $others++;
	}
      }
    }else {
      #      print " already token or punctuation\: $words[$i] \n";
    }
  }
  
  for my $i(0 .. $#words) {
      if (exists ($$FeatureDictH{$words[$i]}{ID})) {
	  $TestFeatureVecH{$words[$i]}++;
      }
  }

  # here we add in the bigrams
  if (length($line) > 1) {
    for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
      my $pre = $words[$i-1];
      my $now = $words[$i];
      # add bigram into dict and train or test vector
      if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
	$TestFeatureVecH{"$pre $now"}++;
      }
    } # end with bigram features
  }
  
  # try to normalize using F1
  $TestFeatureVecH{CsenLen} = $senLen;
  if ($senLen > 0) {
    $TestFeatureVecH{CdateNumPer} =  sprintf("%.8f", $dateNum/$senLen);
    $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
    $TestFeatureVecH{CNonDictWordNumPer} =  sprintf("%.8f", $NonDictWordNum/$senLen);
    $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
    $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
    $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
    $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
    $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
    $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
    $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
    $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
    $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
    $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
    $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
    $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
    $TestFeatureVecH{CothersPer} =  sprintf("%.8f", $others/$senLen);
    #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
  }else {
    #print "null line\: $line \n";
  }
 
  if ($FiletoPrint ne "") { 
      open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: here4...could not open $FiletoPrint to write\n";
      print PFH "$label ";
  }

  my $SVMFeaVec = "$label "; #this is a string 
  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
  if ($TestFeatureVecH{$feature} != 0){
     if ($norm) {
	if ($$FeatureDictH{$feature}{max} != 0) {
	# print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
	  my $tmpval =  sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
	  $TestFeatureVecH{$feature} = $tmpval;
	  #print "  $TestFeatureVecH{$feature} \n";
	}else {
	  #print "zero max\: $feature \n";
	}
      }
      if ($FiletoPrint ne "") {
      	print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
      }
      $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
   }else {
     #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
   }
  }
  if ($FiletoPrint ne "") {
      print PFH "\n";
      close(PFH); 
  }

  my $convertedStr = join(" ", @words);
  #return(\%TestFeatureVecH);
   return($SVMFeaVec);
  #return($convertedStr);
}


sub LineFeatureRepre() {
  my $line = shift;
  my $neutral = 1;
  my $neutralAddName = 0;
  my $norm = 1;
  my %TestFeatureVecH = (); #very important
  
  #some of these features might not work for single word case such as 
  #senLen, so might just take this factor out for word case
  #########categorical features################
  my $senLen = 0;
  my $dateNum = 0;
  my $DictWordNum = 0;
  my $NonDictWordNum = 0;
  my $Cap1DictWordNum = 0;
  my $Cap1NonDictWordNum = 0;
  my $digitNum = 0;
  my $others = 0;
  my $affiNum = 0;	
  my $addrNum = 0; # let city, state, country all counted as the addr
                   # for word case, we might need more specific recognition
  my $capNum = 0;
  my $introNum = 0;
  my $phoneNum = 0;
  my $degreeNum = 0;
  my $pubNum = 0;
  my $noteNum = 0;
  my $pageNum = 0;
  ###

  my $TokenLine;			 				 
  if (length($line) > 1) {
    ($TokenLine, $senLen) = &AddrMatch($line); # this is to match the bi-grams in the address database; assume bi-gram is unique for address
    #transformed features				  
  }else {
    $TokenLine = $line;
  }
  my @words = split(/\s+/, $TokenLine);
  #now start the AddrNameConfu, shared among address and people's name 
  #normally do not use this representation

  for my $i(0 .. $#words) {
    if ($words[$i] =~ /\+PAGE\+/) {
      $words[$i] = ":page:";
      $pageNum++;
    }
  } # end with for each word
  
  #match bi-gram on Pubnum, Note and Degree and affiliation (might make it a separate func)					       
  if (($neutral) && (length($line) > 1)) {
    for my $i(1 .. $#words) {
      my $pre = lc($words[$i-1]);
      my $now = lc($words[$i]);
      my $prestem;
      my $nowstem;
      my $degreeMatch;
      my $pubnumMatch;
      my $noteMatch;
      my $affiMatch;

      if ($stem) {
	$prestem = &PSTEM::stem($pre);
	$nowstem = &PSTEM::stem($now);
	$degreeMatch = $degreeH{lc("$prestem $nowstem")};
	$pubnumMatch = $pubnumH{lc("$prestem $nowstem")};
	$noteMatch = $noteH{lc("$prestem $nowstem")};
	$affiMatch = $affiH{lc("$prestem $nowstem")};
      }else { # for bigram match, we do not request both to be capitalized
	$degreeMatch = $degreeH{lc("$pre $now")};
	$pubnumMatch = $pubnumH{lc("$pre $now")};
	$noteMatch = $noteH{lc("$pre $now")};
	$affiMatch = $affiH{lc("$pre $now")};
      }
      
      
      if (($pre =~ /^\s*$/) || ($pre =~ /\:\w+\:/)) {next; }
      
      my %Confuse4BiGram = (
			    1 => 0,
			    2 => 0,
			    3 => 0,
			    4 => 0
			   );
      my $match = 0;
      if ($degreeMatch) {
	$Confuse4BiGram{1} = 1;
	$match = 1;
      }
      if ($pubnumMatch) {
	$Confuse4BiGram{2} = 1;
	$match = 1;
      }
      if ($noteMatch) {
	$Confuse4BiGram{3} = 1;
	$match = 1;
      }
      
      if ($affiMatch) {
	$Confuse4BiGram{4} = 1;
	$match = 1;
      }
      
      if ($match == 0) { next; }
      
      $words[$i] = "\:Confuse4BiGram";
      foreach my $ind(sort {$a <=> $b} keys %Confuse4BiGram) {
	$words[$i] .= "$Confuse4BiGram{$ind}";
      }
      $words[$i] .= "\:";
      
      if ($words[$i] eq "\:Confuse4BiGram1000\:") {
	$words[$i-1] = "";
	$words[$i] = ":degree:";
	$degreeNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0100\:") {
	$words[$i-1] = "";
	$words[$i] = ":pubnum:";
	$pubNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0010\:") {
	$words[$i-1] = "";
	$words[$i] = ":note:";
	$noteNum++;
      }elsif ($words[$i] eq "\:Confuse4BiGram0001\:") {
	$words[$i-1] = "";
	$words[$i] = ":affi:";
	$affiNum++;
      }
    }
  }#end with neutral bigram
  
  # single words match on Pubnum, notes and degree!
  for my $i(0 .. $#words) {
    if (($words[$i] !~ /\:\w+\:/) && ($words[$i] !~ /^\W+\s*$/)) {
      if ($neutral) {
	my %Confuse4Single = (
			      1 => 0,
			      2 => 0,
			      3 => 0,
			      4 => 0
			     );
	my $match = 0;
	my $degreeMatch;
	my $pubnumMatch;
	my $noteMatch;
	my $affiMatch;
	my $stemword;
	
	if ($stem) {
	  $stemword = &PSTEM::stem($stemword);
	  $degreeMatch = $degreeH{$stemword};
	  $pubnumMatch = $pubnumH{$stemword};
	  $noteMatch = $noteH{$stemword};
	  $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{$stemword});
	}else {
	  $degreeMatch = $degreeH{lc($words[$i])};
	  $pubnumMatch = $pubnumH{lc($words[$i])};
	  $noteMatch = $noteH{lc($words[$i])};
	  $affiMatch = ($words[$i] =~ /^[\p{IsUpper}]/ && $affiH{lc($words[$i])});
	}
	
	#because hhan@cse.psu.edu will become hhan.psu.edu after stemming
	#and $stemword is lower case
	if ($degreeMatch) {
	  $Confuse4Single{1} = 1;
	  $match = 1;
	}
	if ($pubnumMatch) {
	  $Confuse4Single{2} = 1;
	  $match = 1;
	}
	if ($noteMatch) {
	  $Confuse4Single{3} = 1;
	  $match = 1;
	}
	if ($affiMatch) {
	  $Confuse4Single{4} = 1;
	  $match = 1;
	}
	
	if ($match) {
	  $words[$i] = "\:Confuse4Single";
	  foreach my $ind(sort {$a <=> $b} keys %Confuse4Single) {
	    $words[$i] .= "$Confuse4Single{$ind}";
	  }
	  $words[$i] .= "\:";
	  if ($words[$i] eq "\:Confuse4Single1000\:") {
	    $words[$i] = ":degree:";
	    $degreeNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0100\:") {
	    $words[$i] = ":pubnum:";
	    $pubNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0010\:") {
	    $words[$i] = ":note:";
	    $noteNum++;
	  }elsif ($words[$i] eq "\:Confuse4Single0001\:") {
	    $words[$i] = ":affi:";
	    $affiNum++;
	  }
	}
      }# end with neutral

      if ($words[$i] !~ /\:\w+\:/) {
	if (exists($conjH{$words[$i]})) {
	  $words[$i] = ":conj:";
	}elsif (exists($prepH{$words[$i]})) {
	  $words[$i] = ":prep:";
	}elsif ($words[$i] =~ /\@/) {
	  $words[$i] = "\:Email\:";
	}elsif ($words[$i] =~ /(http)|(ftp)\:\/\/(\w+\.){1,}/i) {
	  $words[$i] = "\:http\:";
	}elsif ($words[$i] =~ /^[\p{IsUpper}]/) { # Capitalize letter 1
	  if ((length($words[$i]) == 1) || ($words[$i] =~ /^[\p{IsUpper}]\.$/)) {
	    $words[$i] = ":SingleCap:"; #like M
	    $capNum ++; # actually only the number of single cap
	  }elsif (exists ($postcodeH{lc($words[$i])})) { # 2 caps
	    $words[$i] = ":postcode:";
	  }elsif (($i == 0) && ($abstractH{lc($words[$i])})) {
	    $words[$i] = ":abstract:";
	  }elsif (($i == 0) && ($keywordH{lc($words[$i])})) {
	    $words[$i] = ":keyword:";
	  }elsif ($introH{lc($words[$i])}) {
	    $words[$i] = ":intro:";
	    $introNum++;
	  }elsif ($phoneH{lc($words[$i])}) {
	    $words[$i] = ":phone:";
	    $phoneNum++;
	  }elsif ($monthH{lc($words[$i])}) {
	    $words[$i] = ":month:";
	    $dateNum++;
	  }else {
	    if ($neutral) {
	      if ($addrH{lc($words[$i])}) {
		$words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($cityH{lc($words[$i])}) { #If not neutral class
		$words[$i] = ":city:";
		$addrNum++;
	      }elsif ($stateH{lc($words[$i])}) {
		$words[$i] = ":state:";
		$addrNum++;
	      }elsif ($countryH{lc($words[$i])}) {
		$words[$i] = ":country:";
		$addrNum++;
	      }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
		$words[$i] = ":MayName:";
		$Cap1NonDictWordNum ++;
	      }elsif ($dictH{lc($words[$i])}) {
		$words[$i] = ":Cap1DictWord:";
		$Cap1DictWordNum ++;		
	      }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
		my @Parts = split(/\W+|\-/, $words[$i]);
		for $i(0 .. $#Parts) {
		  if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:LowerWord"."$len"."\:";
		    $Parts[$i] = "\:LowerWords\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:CapWord"."$len"."\:";
		    $Parts[$i] = "\:CapWords\:";
		  }elsif ($Parts[$i] =~ /^\d+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:Dig\[$len\]\:";
		    $Parts[$i] = "\:Digs\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
		    $Parts[$i] = "\:MixCaseWords\:";
		  }else {
		    my $len = length($Parts[$i]);
		    $Parts[$i] = "\:Mix\[$len\]\:";
		  }
		}
		$words[$i] = join("\-", @Parts);
	      }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
		my $len = length($words[$i]);
		$words[$i] = "\:CapWord"."$len"."\:";
		#	      $words[$i] = "\:CapWords\:";
	      }else {
		$words[$i] = ":Cap1NonDictWord:";
		$Cap1NonDictWordNum ++;
	      }
	    }else {#end with neutral

	      if ($degreeH{lc($words[$i])}) {
		$words[$i] = ":degree:";
		$degreeNum++;
	      }elsif ($pubnumH{lc($words[$i])}) {
		$words[$i] = ":pubnum:";
		$pubNum++;
	      }elsif ($noteH{lc($words[$i])}) {
		$words[$i] = ":note:";
		$noteNum++;
	      }elsif ($monthH{lc($words[$i])}) {
		$words[$i] = ":month:";
		$dateNum++;
	      }elsif ($affiH{lc($words[$i])}) {
		$words[$i] = ":affi:";
		$affiNum++;
	      }elsif ($addrH{lc($words[$i])}) {
		$words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($cityH{lc($words[$i])}) { #If not neutral class
		$words[$i] = ":city:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($stateH{lc($words[$i])}) {
		$words[$i] = ":state:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($countryH{lc($words[$i])}) {
		$words[$i] = ":country:";
		#	    $words[$i] = ":addr:";
		$addrNum++;
	      }elsif ($nameH{lc($words[$i])}) { # end with not neutral class
		$words[$i] = ":MayName:";
		$Cap1NonDictWordNum ++;
	      }elsif ( $dictH{lc($words[$i])}) {
		$words[$i] = ":Cap1DictWord:";
		$Cap1DictWordNum ++;		
	      }elsif ($words[$i] =~ /\W+|\-/) { #like BU-CS-93-015; maybe the length could be relaxed; I add \W+ here!!!
		my @Parts = split(/\W+|\-/, $words[$i]);
		for $i(0 .. $#Parts) {
		  if ($Parts[$i] =~ /^[\p{IsLower}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:LowerWord"."$len"."\:";
		    $Parts[$i] = "\:LowerWords\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}]+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:CapWord"."$len"."\:";
		    $Parts[$i] = "\:CapWords\:";
		  }elsif ($Parts[$i] =~ /^\d+$/) {
		    my $len = length($Parts[$i]);
		    #	      $Parts[$i] = "\:Dig\[$len\]\:";
		    $Parts[$i] = "\:Digs\:";
		  }elsif ($Parts[$i] =~ /^[\p{IsUpper}\p{IsLower}]+$/) {
		    $Parts[$i] = "\:MixCaseWords\:";
		  }else {
		    my $len = length($Parts[$i]);
		    $Parts[$i] = "\:Mix\[$len\]\:";
		  }
		}
		$words[$i] = join("\-", @Parts);
	      }elsif ($words[$i] =~ /^[\p{IsUpper}]+$/) {
		my $len = length($words[$i]);
		$words[$i] = "\:CapWord"."$len"."\:";
		#	      $words[$i] = "\:CapWords\:";
	      }else {
		$words[$i] = ":Cap1NonDictWord:";
		$Cap1NonDictWordNum ++;
	      }
	    }
	  }#end with else neutral
	}elsif ($words[$i] =~ /^[\p{IsLower}]/) { # small case letter 1
	  if (exists ($phoneH{$words[$i]})) {
	    $words[$i] = ":phone:";
	    $phoneNum++;
          }elsif (exists ($monthH{lc($words[$i])})) {
	    $words[$i] = ":month:";
	    $dateNum++;
	  }elsif ($keywordH{lc($words[$i])}) {
	    $words[$i] = ":keyword:";
	  }elsif (exists $dictH{lc($words[$i])}) { 
	    $words[$i] = ":DictWord:";
	    $DictWordNum ++;		
	  }else {# should consider the mixure of digit and letters
	    $words[$i] = ":NonDictWord:";
	    $NonDictWordNum ++;
	  } 
	}elsif ($words[$i] =~ /^[\d\-]+$/) { #like 30332-0280 or 1111
	  my $newword = $words[$i];
	  while ($words[$i] =~ /(\d+)/g) {
	    my $dig = $1;
	    my $diglen = length($dig);
	    $newword =~ s/$dig/ \:Dig\[$diglen\]\: /;
	  }
	  $words[$i] = $newword;
	  $digitNum++;
	}elsif ($words[$i] =~ /^(\W+)(.*)$/) { #start from a non-word character
	  my $nonword = $1;
	  my $rest = $2;
	  $words[$i] = $nonword;
	  while (length($rest) > 0) {
	    if ($rest =~ /^([\p{IsUpper}]+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:CapWords\:".length($tmp); #length may be relaxed
	    }elsif ($rest =~ /^([\p{IsLower}]+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:LowerWords\:".length($tmp);
	    }elsif ($rest =~ /^(\d+)(.*)$/) {
	      my $tmp = $1;
	      $rest = $2;
	      $words[$i] .= "\:Digs\:".length($tmp);
	    }else { #get the head character
	      my $restLen = length($rest);
	      $restLen--;
	      $words[$i] .= substr($rest, 0, 1);
	      $rest = substr($rest, 1, $restLen);
	    }
	  }
	}else {
	  $others++;
	}
      }
    }else {
      #      print " already token or punctuation\: $words[$i] \n";
    }
  }
  
  for my $i(0 .. $#words) {
#      if (exists ($$FeatureDictH{$words[$i]}{ID})) {
	  $TestFeatureVecH{$words[$i]}++;
#      }
  }

  # here we add in the bigrams
  if (length($line) > 1) {
    for my $i(1 .. $#words) { #not good for (0 .. $#words-1) soemtimes
      my $pre = $words[$i-1];
      my $now = $words[$i];
      # add bigram into dict and train or test vector
#      if (exists ($$FeatureDictH{"$pre $now"}{ID})) {
	$TestFeatureVecH{"$pre $now"}++;
#      }
    } # end with bigram features
  }
  
  # try to normalize using F1
  $TestFeatureVecH{CsenLen} = $senLen;
  if ($senLen > 0) {
    $TestFeatureVecH{CdateNumPer} =  sprintf("%.8f", $dateNum/$senLen);
    $TestFeatureVecH{CDictWordNumPer} = sprintf("%.8f", $DictWordNum/$senLen);
    $TestFeatureVecH{CNonDictWordNumPer} =  sprintf("%.8f", $NonDictWordNum/$senLen);
    $TestFeatureVecH{CCap1DictWordNumPer} = sprintf("%.8f", $Cap1DictWordNum/$senLen);
    $TestFeatureVecH{CCap1NonDictWordNumPer} = sprintf("%.8f", $Cap1NonDictWordNum/$senLen);
    $TestFeatureVecH{CdigitNumPer} = sprintf("%.8f", $digitNum/$senLen);
    $TestFeatureVecH{CaffiNumPer} = sprintf("%.8f", $affiNum/$senLen);
    $TestFeatureVecH{CaddrNumPer} = sprintf("%.8f", $addrNum/$senLen);
    $TestFeatureVecH{CintroNumPer} = sprintf("%.8f",$introNum/$senLen);
    $TestFeatureVecH{CphoneNumPer} = sprintf("%.8f",$phoneNum/$senLen);
    $TestFeatureVecH{CdegreeNumPer} = sprintf("%.8f",$degreeNum/$senLen);
    $TestFeatureVecH{CpubNumPer} = sprintf("%.8f",$pubNum/$senLen);
    $TestFeatureVecH{CnoteNumPer} = sprintf("%.8f",$noteNum/$senLen);
    $TestFeatureVecH{CpageNumPer} = sprintf("%.8f",$pageNum/$senLen);
    $TestFeatureVecH{CcapNumPer} = sprintf("%.8f",$capNum/$senLen);
    $TestFeatureVecH{CothersPer} =  sprintf("%.8f", $others/$senLen);
    #$TestFeatureVecH{ClinePos} = sprintf("%.8f", $linePos);
  }else {
    #print "null line\: $line \n";
  }
  
  if ($FiletoPrint ne "") { 
      open(PFH, ">$FiletoPrint") || die "SVMHeaderParse: could not open $FiletoPrint to write: $!";
      print PFH "$label ";
  }

  if (0) {
  my $SVMFeaVec = ""; #this is a string 
  foreach my $feature (sort {$$FeatureDictH{$a}{ID} <=> $$FeatureDictH{$b}{ID}} keys %TestFeatureVecH) {
  if ($TestFeatureVecH{$feature} != 0){
     if ($norm) {
	if ($$FeatureDictH{$feature}{max} != 0) {
	# print "feature: $TestFeatureVecH{$feature} ; dict $$FeatureDictH{$feature}{max} => ";
	  my $tmpval =  sprintf("%.8f", $TestFeatureVecH{$feature}/$$FeatureDictH{$feature}{max});
	  $TestFeatureVecH{$feature} = $tmpval;
	  #print "  $TestFeatureVecH{$feature} \n";
	}else {
	  #print "zero max\: $feature \n";
	}
      }
      if ($FiletoPrint ne "") {
      	print PFH "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
      }
      $SVMFeaVec .= "$$FeatureDictH{$feature}{ID}\:$TestFeatureVecH{$feature} ";
   }else {
     #print "zero value\: $feature ($TestFeatureVecH{$feature}) \n";
   }
  }
}

  
  if ($FiletoPrint ne "") {
      print PFH "\n";
      close(PFH); 
  }

  my $convertedStr = join(" ", @words);
  return(\%TestFeatureVecH);
  # return($SVMFeaVec);
  #return($convertedStr);
}


sub WordFeatureRepre() {
  my $line = shift;
  my $dict = shift; 
  my @FeatureLine;


  return(\@FeatureLine);
}

#Given a line, make the space explicit
sub FillSpace() { #recognize <<sep>>, instead of <sep>
  my $content = shift;
  my $lineNO = 0;

  $content =~ s/\s+<<sep>>/<<sep>>/g;
  $content =~ s/<<\/sep>>\s+/<<\/sep>>/g;

  my $punc = 0; # space is the only separator
  if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<<sep>>)(<<\/sep>>)])|(\W+and\W+)/ig)  {
      $punc = 1; #contains punctuation
  }

  my @Seq = split(/(<<sep>>[^\<\>]*<<\/sep>>)/, $content); #the () keeps the spliter in the array @Seq!
  for my $i (0 .. $#Seq) {
      if ($Seq[$i] =~ /<<sep>>/) {
	  #print "spliter\: $Seq[$i] \n";
      }else {
	  #this is the place to separate the punctuations and fill the space
#	  print "before removing the space $Seq[$i] \n";
	  $Seq[$i] =~ s/\s+<<sep>>/<<sep>>/g;
	  $Seq[$i] =~ s/<<\/sep>>\s+/<<\/sep>>/g;
	  # remove space arround punctuations 
	  $Seq[$i] =~ s/\s+/ \<space\> /g;
	  $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
#	  print "after  removing the space $Seq[$i]\n";
      }
  }
  
  $content = join(" ", @Seq);
  return($punc, $content);  
}


#Given a line, make the space explicit
sub OfflineFillSpace() { #recognize <sep>
  my $content = shift;
  my $lineNO = 0;

  $content =~ s/\s+<sep>/<sep>/g;
  $content =~ s/<\/sep>\s+/<\/sep>/g;

  my $punc = 0; # space is the only separator
  if ($content =~ /([^\p{IsLower}\p{IsUpper}\s+\-\.\d+(<sep>)(<\/sep>)])|(\W+and\W+)/ig)  {
      $punc = 1; #contains punctuation
  }

  my @Seq = split(/(<sep>[^\<\>]*<\/sep>)/, $content); #the () keeps the spliter in the array @Seq!
  for my $i (0 .. $#Seq) {
      if ($Seq[$i] =~ /<sep>/) {
	  #print "spliter\: $Seq[$i] \n";
      }else {
	  #this is the place to separate the punctuations and fill the space
#	  print "before removing the space $Seq[$i] \n";
	  $Seq[$i] =~ s/\s+<sep>/<sep>/g;
	  $Seq[$i] =~ s/<\/sep>\s+/<\/sep>/g;
	  # remove space arround punctuations 
	  $Seq[$i] =~ s/\s+/ \<space\> /g;
	  $Seq[$i] =~ s/<space>\s+(\W+)\s+<space>/ $1 /g;
#	  print "after  removing the space $Seq[$i]\n";
      }
  }
  
  $content = join(" ", @Seq);
  return($punc, $content);  
}


sub SeparatePunc0108bak() {
  my $line = shift;

  #added 12/16
  $line =~ s/^\s+//g;
  $line =~ s/\s+$//g;

  $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
  $line =~ s/[\w+]{3,}(\.)\s+/ $1 /g;
  $line =~ s/\s+/ /;
 
  return($line);
}

sub SeparatePunc() {
  my $line = shift;

  $line =~ s/^\s+//g;
  $line =~ s/\s+$//g;

  $line =~ s/([^\p{IsLower}\p{IsUpper}\s+\-\d+\<\>\.]\s+)/ $1 /g;
  $line =~ s/([\w+]{3,})(\.)\s+/$1 $2 /g; #"Dept. of" becomes "Dept . of"
  #How about blah, blah, ... blah. And ....
  #Dr. Smith will be keep the small dot
  #Sep. will keep the small dot as well.
  #But how about removing every dot, including Dr. and Sep. ?

#  $line =~ s/\W+$//g; #remove last punctuation
  $line =~ s/\s+/ /;

  return($line);
}


sub weired_author(){
    my $str = shift;
    
    my $weired = 0;
    my %weired_words = (
			'Departamento' =>1,
			'IN PRESS'=>1,
			'PRESS'=>1,
			'Center'=>1,
			'Ltd' =>1,
			'Universidad'=>1,
			'chair' =>1,
			'Submitted'=>1,
			'pp'=>1,
			'Version'=>1,
			'Thesis' =>1,
			'Proposal' =>1,
			'University'=>1,
			'Universiteit'=>1,
			'Institut'=>1,
			'extended'=>1,
			'abstract'=>1,
			'Laboratoire'=>1,
			'COVER PAGE'=>1,
			'COVER'=>1,
			'Page' => 1,
			'Job Title'=>1,
			'Job'=>1,
			'Title'=>1,
			'Case Study'=>1,
			'Case Sludy'=>1,
			'Case'=>1,
			'Report'=>1,
			'Reply'=>1,
			'A Report'=>1,
			'A Reply'=>1,
			'Research'=>1,
			'Paper'=>1,
			'Research Paper'=>1,
			'Research Project'=>1,
			'Project'=>1,
			'Retrospective'=>1,
			'Roadmap'=>1,
			'Tutorial'=>1,
			'WORKING PAPER'=>1,
			'Working' =>1,
			'White Paper'=>1,
			'in honor of'=>1,
			'international' =>1,
			'Dataset' =>1,
			'Sample' =>1,
			'Network'=>1,
			'Networks'=>1,
			'Academiae'=>1,
			'company'=>1,
			'Submitted'=>1,
			);

    my %filter_words = (
			'honor'=>1,
			'ed'=>1,
			'eds'=>1,
			'jr'=>1,
			'jr\.'=>1,
			'authors'=>1,
			'author' =>1,
			'editor'=>1,
			'editors'=>1,
			'with'=>1,
			'by'=>1,
			);
    
    #if separate authors into individuals.
    ##    $str =~ s/^\s*[^\p{IsLower}\p{IsUpper}\d\-\.]//g;
#    $str =~ s/[^\p{IsLower}\p{IsUpper}\d\-\.]\s*$//g;

    my @weired_words_arr = keys %weired_words;   
    my $weired_words_str = join("|", @weired_words_arr);
    
    #print "\n\nbefore: $str\n";
    $str =~ s/\./\. /g;
    $str =~ s/\d+//g;
    $str =~ s/^\s*\W+//g;
    $str =~ s/\W+\s*$//g;
    $str =~ s/\s+/ /g;
    $str = &str_space_clean($str);
    #print "after: $str \n"; 
    
    my @words = split(/\s+/, $str);
    my $lcase_num = 0;
    my $weired_form = 0;
    my @new_name = ();
    my $pure_single_letter = 1;
    for my $i(0 .. $#words) {
	if ( (length($words[$i]) > 1) && ($words[$i] !~ /^\w\.$/)) {
	    $pure_single_letter = 0;
	}
	if ($filter_words{lc($words[$i])} || ($words[$i] !~ /\w/)) {
	    next;
	}else {
	    if ($words[$i] =~ /^[\p{IsLower}\-]+$/) {
		$lcase_num++;
	    }elsif ($words[$i] =~ /[^\p{IsLower}\p{IsUpper}\-\.]/) {
		$weired_form++;
	    }
	    #make the first letter capitalized
	    $words[$i] = ucfirst(lc($words[$i])); 
	    push @new_name, $words[$i];
	}
    }
    if (($pure_single_letter) || ($str =~ /$weired_words_str/) || ($#words > 4) || ($#new_name <1) || (($#words +1 - $weired_form) < 2) || ($lcase_num>2)) {
	$weired = 1;
    }
    #print "weired:? $weired \n";
    $str = join(' ', @new_name);
    #print "final str $str\n";
    
    return($weired, $str);
}


#turn array into hash map { $hash_name{$_} =$some_value } @array_name;
sub hash_stopwords {
    my $stopword = "$Database_Dir/stopwords";
    my %stopH = ();
    open(stopReader, "$stopword") || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <stopReader>) {
	$line = &str_space_clean($line);
	$stopH{$line}++;
    }
    close(stopReader);
    return(\%stopH);
}

sub hash_affi_stopwords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/affi.txt";
    my %stopH = ();
    open(stopReader, "$stopword") || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <stopReader>) {
	$line = &str_space_clean($line);
	$line =~ s/^\d+\s+//g;
	$stopH{lc($line)}++;
    }
    close(stopReader);
    return(\%stopH);
}

sub hash_nickname{
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/nickname.txt";
    my %stopH = ();
    open(stopReader, "$stopword") || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <stopReader>) {
	$line = &str_space_clean($line);
	my @names = split(/<>|\s*\,\s*/, $line);
	for my $i(1 .. $#names) {
	    $stopH{lc($names[0])}{lc($names[$i])} = 1;
	}
    }
    close(stopReader);
    return(\%stopH);
}

sub hash_statewords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/statename.txt";
    my %stopH = ();
    open(stopReader, "$stopword") || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <stopReader>) {
	$line = &str_space_clean($line);
	my ($state, $abbr) = split(/\s*\,\s*/, $line);
	$stopH{$abbr} = $state;
    }
    close(stopReader);
    return(\%stopH);
}


sub hash_addrwords {
    #my $DB_dir = "/home/hhan/projects/public_library/DB";
    my $stopword = "$Database_Dir/addr.txt";
    my %stopH = ();
    open(stopReader, "$stopword") || die "SVMHeaderParse: could not open $stopword to read \n";
    while (my $line = <stopReader>) {
	$line = &str_space_clean($line);
	$line =~ s/^\d+\s+//g;
	$stopH{lc($line)}++;
    }
    close(stopReader);
    return(\%stopH);
}

sub str_space_clean() {
    my $str = shift;

    $str =~ s/\s+/ /g;
    $str =~ s/^\s+//g;
    $str =~ s/\s+$//g;
    return($str);
}

sub nfreeze_hash_to_file() {
    my $H = shift;
    my $F = shift;

    my $mystring = nfreeze($H);
    open(dumpFH,  ">$F") || die "SVMHeaderParse: could not open $F to write: $!";
    print dumpFH "$mystring";
    close(dumpFH);
}

sub dump_hash_to_file() {
    my $H = shift;
    my $F = shift;

    $d = Data::Dumper->new([$H]);
    $mystring = $d->Dump;
    open(dumpFH, ">$F") || die "SVMHeaderParse: could not open $F to write: $!";
    print dumpFH "$mystring";
    close(dumpFH);
}


sub read_hash_from_file() {
  my $file = shift;

  undef $/;
  open(dumpFH, "$file") || die "SVMHeaderParse: could not open $file to read. \n";
  my $string = <dumpFH>;
  close(dumpFH);
  $/ = "\n";

  eval($string);
  return($VAR1);
}

sub thaw_hash_from_file() {
  my $file = shift;

  undef $/;
  open(dumpFH, "$file") || die "SVMHeaderParse: could not open $file to read. \n";
  my $string = <dumpFH>;
  close(dumpFH);
  $/ = "\n";

  my $VAR1 = thaw($string);
  return($VAR1);
}

sub rand_split_samples_to2parts() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=0;$j--){
        $r=int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            #adjust the samples after the selected one
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }else {
            push @part2, $$samples[$r];
        }
    }
    return(\@part1, \@part2);
}

sub rand_split_samples_to2parts_v2() {
    my $samples = shift; #array
    my $ratio = shift;

    my $total_num = $#$samples;
    my $num1 = int($total_num*$ratio);
    my $num2 = $total_num - $num1;
    my (@part1, @part2);
    print STDERR "rand_split_samples_to2parts\: $ratio of $total_num is $num1\n";
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=0;$j--){
        $r=int(rand($j));
        if (($total_num - $j) < $num1) {
            push @part1, $$samples[$r];
            #adjust the samples after the selected one
            for my $k($r .. $#$samples-1) {
                $$samples[$k] = $$samples[$k+1];
            }
            pop @$samples;
        }
    }
    return(\@part1, $samples);
}

sub rand_split_samples_toNparts() {
    my $samples = shift; #array
    my $fold = shift;

    my $total_num = $#$samples;
    my $unit = int($total_num/$fold +1);
    my $last_fold = $total_num - $unit*($fold-1);

    my @data = ();
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=1;$j--){
        $r=int(rand($j));
	my $subfold = int(($total_num - $j)/$unit) + 1;
	push @{$data[$subfold]}, $$samples[$r];
	#adjust the samples after the selected one
	for my $k($r .. $#$samples-1) {
	    $$samples[$k] = $$samples[$k+1];
	}
	pop @$samples;
    }
    return(@data);
}

sub rand_split_hash_index_toNparts() {
    my $sample_hash = shift; #hash
    my $fold = shift;

    my @sample_arr = keys %{$sample_hash};
    my $total_num = $#sample_arr;
    my $unit = int($total_num/$fold +1);
    my $last_fold = $total_num - $unit*($fold-1);

    my @data = ();
    $t=time;
    srand($t); #seed
    for($j=$total_num;$j>=1;$j--){
        $r=int(rand($j));
	my $subfold = int(($total_num - $j)/$unit) + 1;
	
	my $name = $sample_arr[$r];
	my %pos = ();
	if ($$sample_hash{$name}{label} > -1) {
	    my @tmp = split(/\<\>/, $$sample_hash{$name}{label});
	    map { $pos{$_} =1 } @tmp;
	}
	foreach my $file_name (keys %{$$sample_hash{$name}{name}}) {
	    my ($tmp, $num) = split(/\_\_/, $file_name);
	    my $label = "-1";
	    if ($pos{$num}) {
		$label = "+1";
	    }
	    push @{$data[$subfold]}, "$label<>$file_name<>$$sample_hash{$name}{name}{$file_name}{snippet}";
	}
	
	for my $k($r .. $#sample_arr-1) {
	    $sample_arr[$k] = $sample_arr[$k+1];
	}
	pop @sample_arr;
    }
    return(@data);
}

sub ExtractBinaryNfoldSVMResult() {
    my $in = shift;
    my %ResultH = ();

    open (inFH, "$in") || die "SVMHeaderParse: could not open $in to read \n";
    while (my $line = <inFH>) {
	if ($line =~ /Accuracy on test set: (\d+\.\d+)\%/) {
	    $ResultH{A}{count}++;
	    $ResultH{A}{sum} += $1;
	}
	if ($line =~ /Precision\/recall on test set\: (.*)\%\/(.*)\%/) {
	    my $P = $1;
	    my $R = $2;
	    if ($P =~ /\d+\.\d+/) {
		$ResultH{P}{count}++;
		$ResultH{P}{sum} += $P;
	    }
	    if ($R =~ /\d+\.\d+/) {
		$ResultH{R}{count}++;
		$ResultH{R}{sum} += $R;
	    }
	}
    }
    close(inFH);
    
    print STDERR "average result from cross validation \n";
    foreach my $eval(sort {$a <=> $b} keys %ResultH) {
        $ResultH{$eval}{avg} = sprintf("%.8f", $ResultH{$eval}{sum}/$ResultH{$eval}{count});
	print STDERR "evaluation($eval) -- $ResultH{$eval}{avg}\n";
    }
}    

## get alias file
sub GetNameVariations1() {
    my $personalName = shift; #like _Chris_S._Mellish__1.txt   
    
    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;
    my ($FirstName, $LastName, $FI, $LI, $MI1, $MI2, $FI_LI, $AllInitial, $AllName, $FILN, $FIMI1LastName, $FIMI1MI2LastName)
	= ('','','','','','','','','','','','','');
    
    $FirstName = $QueryNameParts[0];
    $LastName = $QueryNameParts[$#QueryNameParts];
    
    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName} = "LN";
    
    $FI = substr($FirstName, 0, 1);
    $LI = substr($LastName, 0, 1);

    $FI_LI= "$FI"."$LI";
    $FILN = "$FI"."$LastName";
    $FNLI = "$FirstName"."$LI";
    $NameVariations{$FILN} = "FILN";
    $NameVariations{$FNLI} = "FNLI";
    
    for my $i(0 .. $#QueryNameParts) {
	$QueryNameParts[$i] =~ s/\W+//g;
	$AllName .= $QueryNameParts[$i];
    }
    # dependts on whether this name contains 3 parts or 4 parts
    if ($#QueryNameParts < 1) {next;}
    if ($#QueryNameParts eq 1) {
	$AllInitial = $FI_LI; 	
	$NameVariations{$AllInitial} = "all_initial";
    }else {
	$NameVariations{$FI_LI} = "FILI";
	$NameVariations{"$FN"."$QueryNameParts[1]"} = "FNMN";
	if ($#QueryNameParts eq 2) {
	    $MI1 = substr($QueryNameParts[1], 0, 1);
	    $AllInitial= "$FI"."$MI1"."$LI"; 
	    $FIMI1LastName = "$FI"."$MI1"."$LastName";
	    $NameVariations{$AllInitial} = "all_initial";
	    $NameVariations{$FIMI1LastName} = "FIMI1LN";
	}elsif ($#NameParts eq 3) {
	    $MI1 = substr($QueryNameParts[1], 0, 1);
	    $MI2 = substr($QueryNameParts[2], 0, 1); 
	    $AllInitial = "$FI"."$MI1"."$MI2"."$LI"; 
	    $FIMI1LastName = "$FI"."$MI1"."$LastName";
	    $FIMI1MI2LastName = "$FI"."$MI1"."$MI2"."$LastName";
	    $NameVariations{$AllInitial} = "all_initial";
	    $NameVariations{$FIMI1LastName} = "FIMI1LN";
	    $NameVariations{$FIMI1MI2LastName} = "FIMI1MI2LN";
	}
    }
    
    ## It will take chance for this exact match
    if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
	$PartLastName =  substr($QueryNameParts[$#QueryNameParts], 0, 5);
	$NameVariations{$PartLastName} = "partial_LN";
    }
    return(\%NameVariations);	
}

sub GetNameVariations() {
    my $personalName = shift; #like _Chris_S._Mellish__1.txt   
    my $nickname = shift;
    
    my @QueryNameParts = split(/\s+|\-/, $personalName);
    my %NameVariations;
    my ($FirstName, $LastName, $FI, $LI, $MI1, $MI2, $FI_LI, $AllInitial, $AllName, $FILN, $FIMI1LastName, $FIMI1MI2LastName)
	= ('','','','','','','','','','','','','');
    
    $FirstName = $QueryNameParts[0];
    $LastName = $QueryNameParts[$#QueryNameParts];
# using first 5 letters decreases performance if not using substring matching
#    if (length($QueryNameParts[0]) > 4) {
#	$FirstName = substr($QueryNameParts[0],0,5);
#    }
#    if (length($QueryNameParts[$#QueryNameParts]) > 4) {
#	$LastName = substr($QueryNameParts[$#QueryNameParts],0,5);
#    }    
    $NameVariations{$FirstName} = "FN";
    $NameVariations{$LastName} = "LN";
    foreach my $alias(keys %{$$nickname{lc($FirstName)}}) {
	$NameVariations{$alias} = "FN";
    }
    $FI = substr($FirstName, 0, 1);
    $LI = substr($LastName, 0, 1);

    $FI_LI= "$FI"."$LI";
    $FILN = "$FI"."\\"."w*"."$LastName";
    $FNLI = "$FirstName"."$LI";
    $LNFI = "$LastName"."$FI";
    $NameVariations{$FILN} = "FILN";
    $NameVariations{"$FI"."\."."$LastName"} = "FILN";
    $NameVariations{$FNLI} = "FNLI";
    $NameVariations{$LNFI} = "LNFI";
    
    for my $i(0 .. $#QueryNameParts) {
	$QueryNameParts[$i] =~ s/\W+//g;
	$AllName .= $QueryNameParts[$i];
    }
    $NameVariations{$AllName} = "full_name";
    $NameVariations{"\\"."w*"."$FirstName"."\\"."w*"."$LastName"."\\"."w*"} = "FNLN";
    $NameVariations{"$FirstName"."\."."$LastName"} = "FNLN";
    $NameVariations{"$LastName"."$FirstName"} = "LNFN";

    # depends on whether this name contains 3 parts or 4 parts
    if ($#QueryNameParts < 1) {next;}
    elsif ($#QueryNameParts eq 1) {
	$AllInitial = $FI_LI; 	
	$NameVariations{$AllInitial} = "all_initial";
    }else {
	$NameVariations{$FI_LI} = "FILI";
	$NameVariations{"$FirstName"."$QueryNameParts[1]"} = "FNMN";
	if ($#QueryNameParts eq 2) {
	    $MI1 = substr($QueryNameParts[1], 0, 1);
	    $AllInitial= "$FI"."$MI1"."$LI"; 
	    $FIMI1LastName = "$FI"."$MI1"."$LastName";
	    $NameVariations{$AllInitial} = "all_initial";
	    $NameVariations{$FIMI1LastName} = "FIMI1LN";
	}elsif ($#QueryNameParts eq 3) {
	    $MI1 = substr($QueryNameParts[1], 0, 1);
	    $MI2 = substr($QueryNameParts[2], 0, 1); 
	    $AllInitial = "$FI"."$MI1"."$MI2"."$LI"; 
	    $FIMI1LastName = "$FI"."$MI1"."$LastName";
	    $FIMI2LI = "$FI"."$MI2"."$LI";
	    $FIMI1LI = "$FI"."$MI1"."$LI";
	    $FIMI1MI2LastName = "$FI"."$MI1"."$MI2"."$LastName";
	    $MN2LastName = "$QueryNameParts[2]"."$LastName";
	    $NameVariations{$AllInitial} = "all_initial";
	    $NameVariations{$FIMI1LastName} = "FIMI1LN";
	    $NameVariations{$FIMI1LI} = "FIMI1LI";
	    $NameVariations{$FIMI2LI} = "FIMI2LI";
	    $NameVariations{$MN2LastName} = "MI2LN";
	    $NameVariations{$FIMI1MI2LastName} = "FIMI1MI2LN";
	}
    }
    
    ## It will take chance for this exact match
    if (length ($QueryNameParts[$#QueryNameParts]) < 4) {
	$PartLastName =  substr($QueryNameParts[$#QueryNameParts], 0, 5);
	$NameVariations{$PartLastName} = "partial_LN";
    }
    return(\%NameVariations);	
}

sub get_university_emails() {
    my $univ = "$Database_Dir/university_list/univ-full.html";
    my $simple_format = "$Database_Dir/university_list.txt";

    my %H = ();
    open(UNIV, $univ) || die "SVMHeaderParse: could not open $univ to read. \n";
    my @content = <UNIV>;
    close(UNIV);
     
    open(simpleWriter, ">$simple_format") || die "SVMHeaderParse: could not open $simple_format to write: $!";
    for my $i(0 .. $#content) {
	if ($content[$i] =~ /\<LI\>\s+\<A\s+HREF\=\"([^\"]*)\"\>(.*)\<\/A\>/) {
	    my $url = $1;
	    my $college = $2;
	    print simpleWriter "$college<>$url\n";
	}
    }
    close(simpleWriter);
    return(\%H);
}

#input: an array of value
sub compute_std() {
    my $arr = shift;
    my $mean = 0;
    my $std = 0;
    
    #cal mean
    for my $i(0 .. $#$arr) {
	$mean += $$arr[$i];
    }
    $mean = sprintf("%.3f", $mean/($#$arr+1));

    for my $i(0 .. $#$arr) {
	$std += ($$arr[$i]-$mean)**2;
    }
    my $temp = sprintf("%.8f", $std/$#$arr);
    $std = sqrt($temp);
    
    return($mean, $std);
}


1;