package lingscope.drivers;

import java.util.ArrayList;
import java.util.List;
import lingscope.io.AnnotatedSentencesIO;
import lingscope.structures.AnnotatedSentence;

/**
 * Merges two annotated files. Useful to merge a words scope file with a POS
 * cue file: the words are taken from the first file and the tags from the
 * second, and each merged token is written as {@code word|tag}.
 *
 * @author shashank
 */
public class AnnotatedFilesMerger {

    /**
     * Merges the words of {@code wordsSentence} with the tags of
     * {@code tagsSentence}, pairing them position by position.
     *
     * <p>The merged text consists of {@code word|tag} tokens separated by a
     * single space, and is passed to the {@link AnnotatedSentence}
     * constructor.
     *
     * @param wordsSentence sentence from which the words are taken
     * @param tagsSentence sentence from which the tags are taken
     * @return the merged sentence, or {@code null} if the two sentences have
     *         a different number of tokens or contain no tokens at all (a
     *         message is printed to standard error in both cases)
     */
    public static AnnotatedSentence merge(AnnotatedSentence wordsSentence,
            AnnotatedSentence tagsSentence) {
        List<String> words = wordsSentence.getWords();
        List<String> tags = tagsSentence.getTags();
        int numTokens = words.size();

        if (tags.size() != numTokens) {
            System.err.println("Skipping non-equal length sentences");
            System.err.println("\tSentence 1: " + wordsSentence.getRawText());
            System.err.println("\tSentence 2: " + tagsSentence.getRawText());
            return null;
        }

        // Nothing to merge; previously this case crashed with a
        // StringIndexOutOfBoundsException when stripping the leading space.
        if (numTokens == 0) {
            System.err.println("Skipping empty sentence");
            return null;
        }

        StringBuilder mergedSentence = new StringBuilder();
        for (int j = 0; j < numTokens; ++j) {
            if (j > 0) {
                mergedSentence.append(' ');
            }
            mergedSentence.append(words.get(j)).append('|').append(tags.get(j));
        }
        return new AnnotatedSentence(mergedSentence.toString());
    }

    /**
     * Reads two annotated files, merges them sentence by sentence and writes
     * the result. Sentence pairs that cannot be merged are skipped.
     *
     * @param args
     * 0 - file 1: the file from which words will be taken
     * 1 - file 2: the file from which tags will be taken
     * 2 - output file path
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            System.err.println("Usage: AnnotatedFilesMerger "
                    + "<words file> <tags file> <output file>");
            return;
        }

        List<AnnotatedSentence> wordsSentences = AnnotatedSentencesIO.read(args[0]);
        List<AnnotatedSentence> tagsSentences = AnnotatedSentencesIO.read(args[1]);

        // Only positions present in both files can be paired; iterating over
        // the tags count alone would read past the end of a shorter words file.
        int numSentences = Math.min(wordsSentences.size(), tagsSentences.size());
        if (wordsSentences.size() != tagsSentences.size()) {
            System.err.println("Files contain a different number of sentences ("
                    + wordsSentences.size() + " vs " + tagsSentences.size()
                    + "); merging only the first " + numSentences);
        }

        List<AnnotatedSentence> mergedSentences =
                new ArrayList<AnnotatedSentence>(numSentences);
        for (int i = 0; i < numSentences; ++i) {
            AnnotatedSentence mergedSentence =
                    merge(wordsSentences.get(i), tagsSentences.get(i));
            if (mergedSentence != null) {
                mergedSentences.add(mergedSentence);
            }
        }
        AnnotatedSentencesIO.write(args[2], mergedSentences);
    }
}