Sha256: 0517b0c008cd927d08a3a592ce776749ff8f60116665be9e0423ead08dbfa40e

Contents?: true

Size: 1.57 KB

Versions: 45

Compression:

Stored size: 1.57 KB

Contents

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package lingscope.algorithms;

import abner.Scanner;
import java.io.StringReader;

/**
 * Static helpers that run text through ABNER's tokenizer
 * ({@code abner.Scanner}) and return the tokens joined by single spaces.
 *
 * @author shashank
 */
public class AbnerTokenizer {

    /**
     * Takes raw text and applies ABNER's built-in tokenization to it.
     *
     * <p>Every token is followed by a single space. A token that is exactly
     * one of {@code ?}, {@code !} or {@code .} is additionally followed by a
     * newline. If tokenization throws, the exception is printed to
     * {@code System.err} and whatever was tokenized up to that point is
     * returned.
     *
     * @param s the raw text to tokenize
     * @return the space-separated tokens (with a newline after each
     *         sentence-terminating token); possibly partial or empty if
     *         tokenization failed
     */
    public static String tokenize(String s) {
        // Local, single-threaded accumulation: no need for StringBuffer's locking.
        StringBuilder sb = new StringBuilder();
        try {
            Scanner scanner = new Scanner(new StringReader(s));
            String t;
            while ((t = scanner.nextToken()) != null) {
                sb.append(t).append(' ');
                if (isSentenceTerminator(t)) {
                    sb.append('\n');
                }
            }
        } catch (Exception e) {
            // Best effort: report the failure and return what was collected so far.
            System.err.println(e);
        }
        return sb.toString();
    }

    /**
     * Takes an input, tokenizes it into words and punctuation, then stitches
     * the tokens back together with spaces.
     *
     * <p>Newlines in the input are replaced by spaces before tokenizing. The
     * result is trimmed, and a separate {@code " ."} is appended when either
     * the tokenized text ends with a period glued directly to a word
     * character, or the input ended with a period that the tokenized text
     * lost.
     *
     * @param input the input string to process; {@code null} is treated like
     *              the empty string
     * @return processed input string, where all words and punctuation are
     *         separated by spaces; {@code ""} for {@code null} or empty input
     */
    public static String splitTermsByPunctuation(String input) {
        if (input == null || input.isEmpty()) {
            return "";
        }
        String flattened = input.replace('\n', ' ');
        String ret = tokenize(flattened).trim();
        if (endsWithFusedPeriod(ret)) {
            // The final period is attached to the preceding token (presumably
            // the tokenizer kept it as part of that token - TODO confirm), so
            // add a standalone one.
            ret += " .";
        }
        if (flattened.endsWith(".") && !ret.endsWith(".")) {
            // The input's closing period did not survive tokenization; restore it.
            ret += " .";
        }
        return ret;
    }

    /** Returns true if the token is exactly one of {@code ?}, {@code !} or {@code .}. */
    private static boolean isSentenceTerminator(String token) {
        return token.length() == 1 && "?!.".indexOf(token.charAt(0)) >= 0;
    }

    /**
     * Returns true if the text ends with a period immediately preceded by a
     * regex {@code \w} character (ASCII letter, digit or underscore).
     *
     * <p>Only the last two characters are inspected, so text containing
     * newlines (which {@link #tokenize(String)} inserts after each sentence)
     * is handled the same as single-line text.
     */
    private static boolean endsWithFusedPeriod(String text) {
        int length = text.length();
        if (length < 2 || text.charAt(length - 1) != '.') {
            return false;
        }
        char c = text.charAt(length - 2);
        return c == '_'
                || (c >= '0' && c <= '9')
                || (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z');
    }
}

Version data entries

45 entries across 45 versions & 1 rubygems

Version Path
abstractor-4.4.7 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.6 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.5 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.4 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.3 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.2 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.1 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.4.0 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.3.3 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.3.2 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.3.1 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.3.0 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.2.3 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.2.2 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.2.1 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.2.0 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.1.5 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.1.4 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.1.3 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java
abstractor-4.1.2 lib/lingscope/src/lingscope/algorithms/AbnerTokenizer.java