1 package org.backsource.utils.text;
2
3 import java.util.Vector;
4
/**
 * Parse text into sentences.
 *
 * A sentence ends at a run of one or more punctuation characters. The
 * set of punctuation characters, whether surrounding white space is
 * trimmed from each sentence, and whether a punctuation run must be
 * followed by a space to count as a sentence end are all configurable.
 */
public class SentenceParser
{
    /**
     * The default punctuation characters.
     * (The identifier keeps its historical spelling for compatibility.)
     */
    public static final String DEFAULT_PUNCTATION_CHARS = ".!?;";

    /**
     * The characters that are treated as sentence-ending punctuation.
     */
    protected String punctationChars = DEFAULT_PUNCTATION_CHARS;

    /**
     * Trim space from sentences
     */
    protected boolean trimSpace = true;

    /**
     * For a punctuation to be recognized, a space is required
     * to be the next character after the punctuation
     * (or the punctuation must be at the very end of the text).
     */
    protected boolean requireSpaceAfterPunctation = true;


    /**
     * Construct a new parser using the default punctuation chars.
     * The parser will be configured to remove white space
     * at the beginning and end of sentences, and require a
     * space to be present after any punctuation.
     */
    public SentenceParser()
    {
        this(DEFAULT_PUNCTATION_CHARS, true, true);
    }

    /**
     * Construct a new parser using custom punctuation chars.
     *
     * @param punctationChars the characters that end a sentence; if
     *        <code>null</code>, {@link #DEFAULT_PUNCTATION_CHARS} is used
     * @param trimSpace remove space from start and end of sentences
     * @param requireSpaceAfterPunctation only recognize punctuations followed by a space
     */
    public SentenceParser(String punctationChars, boolean trimSpace, boolean requireSpaceAfterPunctation)
    {
        // A null set would only surface later as a NullPointerException
        // inside parse(); fall back to the default set instead.
        this.punctationChars =
            (punctationChars != null) ? punctationChars : DEFAULT_PUNCTATION_CHARS;
        this.trimSpace = trimSpace;
        this.requireSpaceAfterPunctation = requireSpaceAfterPunctation;
    }

    /**
     * Construct a new parser using default punctuation chars.
     *
     * @param trimSpace remove space from start and end of sentences
     * @param requireSpaceAfterPunctation only recognize punctuations followed by a space
     */
    public SentenceParser(boolean trimSpace, boolean requireSpaceAfterPunctation)
    {
        this(DEFAULT_PUNCTATION_CHARS, trimSpace, requireSpaceAfterPunctation);
    }

    /**
     * Parse a string into sentences.
     *
     * Each sentence includes its terminating run of punctuation
     * characters. Text after the last recognized punctuation is returned
     * as a final sentence. When space trimming is enabled, sentences that
     * consist only of white space (for example trailing blanks after the
     * last punctuation) are not returned.
     *
     * @param data the text to parse, may be <code>null</code>
     * @return a Vector of String, one element per sentence, in order of
     *         appearance; empty if <code>data</code> is <code>null</code>
     *         or empty
     */
    public Vector parse(String data)
    {
        Vector sentences = new Vector();

        if (data == null) {
            return sentences;
        }

        int len = data.length();
        int start = 0; // index where the current sentence begins
        int i = 0;
        while (i < len) {
            if (!isPunctation(data.charAt(i))) {
                i++;
                continue;
            }

            // Consume the whole run of punctuation ("?!", "...") so that
            // it stays attached to the sentence it terminates.
            int end = i + 1;
            while (end < len && isPunctation(data.charAt(end))) {
                end++;
            }

            // A run at the very end of the text is always a sentence end;
            // otherwise it may have to be followed by a space.
            boolean validPunctation = !requireSpaceAfterPunctation
                || end == len
                || data.charAt(end) == ' ';

            if (validPunctation) {
                addSentence(sentences, data.substring(start, end));
                start = end;
            }

            // Continue scanning after the run; for a rejected run the
            // current sentence simply keeps growing.
            i = end;
        }

        // Whatever follows the last recognized punctuation.
        if (start < len) {
            addSentence(sentences, data.substring(start, len));
        }

        return sentences;
    }

    /**
     * Tell whether a character is one of the configured punctuation chars.
     */
    private boolean isPunctation(char c)
    {
        return punctationChars.indexOf(c) != -1;
    }

    /**
     * Add a sentence to the result, applying the trim setting. When
     * trimming is enabled, a sentence that is empty after trimming is
     * dropped.
     */
    private void addSentence(Vector sentences, String sentence)
    {
        if (trimSpace) {
            sentence = sentence.trim();
            if (sentence.length() == 0) {
                return;
            }
        }
        sentences.add(sentence);
    }

    /**
     * Print every sentence of a parse result to System.out.
     */
    private static void printSentences(Vector sentences)
    {
        for (int i = 0; i < sentences.size(); i++) {
            System.out.println("sent='" + sentences.get(i) + "'");
        }
    }

    /**
     * Test/demonstration. Parse the first argument and print the result to System.out
     */
    public static void main(String[] argz)
    {
        if (argz == null || argz.length == 0) {
            System.err.println("Usage: SentenceParser <text>");
            return;
        }

        System.out.println(argz[0]);
        System.out.println("-- default settings --");

        SentenceParser sp = new SentenceParser();
        printSentences(sp.parse(argz[0]));

        System.out.println();
        System.out.println("-- no space trim, space not required --");
        sp = new SentenceParser(false, false);
        printSentences(sp.parse(argz[0]));
    }

}
147