View Javadoc

1   package org.backsource.utils.text;
2   
3   import java.util.Vector;
4   
/**
 * Parse text into sentences.
 *
 * A sentence ends at a run of one or more punctuation characters. Depending
 * on configuration, the run is only recognized when it is followed by a
 * space (or by the end of the text), and each sentence may have surrounding
 * white space trimmed.
 */
public class SentenceParser
{
	/**
	 * The default punctuation characters.
	 */
	public static final String DEFAULT_PUNCTATION_CHARS = ".!?;";

	/**
	 * The characters that are treated as sentence-ending punctuation.
	 */
	protected String punctationChars = DEFAULT_PUNCTATION_CHARS;

	/**
	 * Trim space from sentences
	 */
	protected boolean trimSpace = true;

	/**
	 * For a punctuation to be recognized, a space is required
	 * to be the next character after the punctuation
	 */
	protected boolean requireSpaceAfterPunctation = true;


	/**
	 * Construct a new parser using the default punctuation chars.
	 * The parser will be configured to remove white space
	 * at the beginning and end of sentences, and require a
	 * space to be present after any punctuation.
	 */
	public SentenceParser()
	{
		this(DEFAULT_PUNCTATION_CHARS, true, true);
	}

	/**
	 * Construct a new parser using custom punctuation chars.
	 *
	 * @param punctationChars the characters that end a sentence; if
	 *        <code>null</code>, {@link #DEFAULT_PUNCTATION_CHARS} is used
	 * @param trimSpace remove space from start and end of sentences
	 * @param requireSpaceAfterPunctation only recognize punctuations followed by a space
	 */
	public SentenceParser(String punctationChars, boolean trimSpace, boolean requireSpaceAfterPunctation)
	{
		/* A null set would otherwise only fail later, inside parse(). */
		this.punctationChars = (punctationChars != null) ? punctationChars : DEFAULT_PUNCTATION_CHARS;
		this.trimSpace = trimSpace;
		this.requireSpaceAfterPunctation = requireSpaceAfterPunctation;
	}

	/**
	 * Construct a new parser using default punctuation chars.
	 *
	 * @param trimSpace remove space from start and end of sentences
	 * @param requireSpaceAfterPunctation only recognize punctuations followed by a space
	 */
	public SentenceParser(boolean trimSpace, boolean requireSpaceAfterPunctation)
	{
		this(DEFAULT_PUNCTATION_CHARS, trimSpace, requireSpaceAfterPunctation);
	}

	/**
	 * Parse a string into sentences.
	 *
	 * Each sentence includes its terminating run of punctuation characters.
	 * Text remaining after the last recognized punctuation is returned as a
	 * final sentence. When space trimming is enabled, a remainder that
	 * consists only of white space is dropped instead of being returned as
	 * an empty sentence.
	 *
	 * @param data the text to split; may be <code>null</code>
	 * @return a Vector of String sentences in order of appearance; empty
	 *         if <code>data</code> is <code>null</code> or empty
	 */
	public Vector parse(String data)
	{
		Vector sentences = new Vector();

		if (data == null) {
			return sentences;
		}

		int len = data.length();
		int start = 0;
		int pos = 0;
		while (pos < len) {
			if (!isPunctuation(data.charAt(pos))) {
				pos++;
				continue;
			}

			/* find the end of the punctuation run (exclusive) */
			int end = pos + 1;
			while (end < len && isPunctuation(data.charAt(end))) {
				end++;
			}

			/* a run that reaches the end of the text is always accepted */
			boolean validPunctation = (end == len)
				|| !requireSpaceAfterPunctation
				|| (data.charAt(end) == ' ');

			if (validPunctation) {
				addSentence(sentences, data.substring(start, end));
				start = end;
			}

			/* the character at 'end' (if any) is known not to be punctuation */
			pos = end;
		}

		/* whatever follows the last recognized punctuation */
		if (start < len) {
			addSentence(sentences, data.substring(start, len));
		}

		return sentences;
	}

	/**
	 * Tell whether a character is one of the configured punctuation chars.
	 */
	private boolean isPunctuation(char c)
	{
		return punctationChars.indexOf(c) != -1;
	}

	/**
	 * Append a sentence to the result, honoring the trimSpace setting.
	 * A sentence that is empty after trimming is not added.
	 */
	private void addSentence(Vector sentences, String sentence)
	{
		if (!trimSpace) {
			sentences.add(sentence);
			return;
		}

		String trimmed = sentence.trim();
		if (trimmed.length() > 0) {
			sentences.add(trimmed);
		}
	}

	/**
	 * Test/demonstration. Parse the first argument and print the result to System.out
	 *
	 * @param argz command line arguments; the first one is the text to parse
	 */
	public static void main(String[] argz)
	{
		if (argz == null || argz.length == 0) {
			System.err.println("usage: SentenceParser <text>");
			return;
		}

		System.out.println(argz[0]);
		System.out.println("-- default settings --");
		printSentences(new SentenceParser().parse(argz[0]));

		System.out.println();
		System.out.println("-- no space trim, space not required --");
		printSentences(new SentenceParser(false, false).parse(argz[0]));
	}

	/**
	 * Print each sentence on its own line to System.out.
	 */
	private static void printSentences(Vector sentences)
	{
		for (int i = 0; i < sentences.size(); i++) {
			System.out.println("sent='" + sentences.get(i) + "'");
		}
	}

}
147