Sha256: 0517b0c008cd927d08a3a592ce776749ff8f60116665be9e0423ead08dbfa40e
Contents?: true
Size: 1.57 KB
Versions: 45
Compression:
Stored size: 1.57 KB
Contents
/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package lingscope.algorithms; import abner.Scanner; import java.io.StringReader; /** * * @author shashank */ public class AbnerTokenizer { //////////////////////////////////////////////////////////////// /** <p>Take raw text apply ABNER's built-in tokenization on it. */ public static String tokenize(String s) { StringBuffer sb = new StringBuffer(); try { Scanner scanner = new Scanner(new StringReader(s)); String t; while ((t = scanner.nextToken()) != null) { sb.append(t+" "); if (t.toString().matches("[?!\\.]")) sb.append("\n"); } return sb.toString(); } catch (Exception e) { System.err.println(e); } return sb.toString(); } /** * Takes an input and splits the sentence by punctuations and spaces, then * stitches it back together with a space and returns * @param input the input string to process * @return processed input string, where all words and punctuations are * seperated by space */ public static String splitTermsByPunctuation(String input) { if (input.isEmpty()) { return ""; } input = input.replaceAll("\\n", " "); String ret = tokenize(input).trim(); if (ret.matches(".*\\w\\.$")) { // If a space is not put between the period in the end, then introduce one ret += " ."; } if (input.endsWith(".") && (!ret.endsWith("."))) { ret += " ."; } return ret; } }
Version data entries
45 entries across 45 versions & 1 rubygems