@@ -76,6 +76,7 @@ namespace Tokenizer {
76
76
77
77
using namespace icu ;
78
78
using TiCC::operator <<;
79
// Switch for the quoted-spaces handling. When true, internal_tokenize_line()
// passes its input through replace_quoted_spaces() and the Token constructors
// pass their text through filter_ZCARON(). Defaults to off.
// NOTE(review): mutable namespace-level state; where it is set is not visible
// here -- confirm before relying on it from multiple threads.
+ bool keep_quoted_spaces = false ;
79
80
80
81
const UChar32 ZWJ = u' \u200D ' ;
81
82
@@ -154,16 +155,41 @@ namespace Tokenizer {
154
155
const UnicodeString type_unknown = " UNKNOWN" ;
155
156
const UnicodeString type_unanalyzed = " UNANALYZED" ;
156
157
158
+ UnicodeString filter_ZCARON ( const UnicodeString& in ){
159
+ UnicodeString result;
160
+ for ( int i=0 ; i < in.length (); ++i ){
161
+ UChar32 c = in[i];
162
+ if ( c == U' Ž' ){
163
+ c = ' ' ;
164
+ }
165
+ result += c;
166
+ }
167
+ return result;
168
+ }
169
+
157
170
// Build a Token from its type, text, role and language code.
// When keep_quoted_spaces is set, the text is first passed through
// filter_ZCARON() so that Ž markers are stored as spaces; otherwise the
// text is stored unchanged.
Token::Token( const UnicodeString& _type,
              const UnicodeString& _s,
              TokenRole _role,
              const string& _lang_code ):
  type( _type ),
  us( keep_quoted_spaces ? filter_ZCARON( _s ) : _s ),
  role( _role ),
  lang_code( _lang_code )
{
}
162
182
163
183
// Build a Token without an explicit role: identical to the four-argument
// constructor called with NOROLE, including the optional filter_ZCARON()
// pass over the text when keep_quoted_spaces is set.
Token::Token( const UnicodeString& _type,
              const UnicodeString& _s,
              const string& _lang_code ):
  Token( _type, _s, NOROLE, _lang_code )
{
}
168
194
169
195
@@ -2946,6 +2972,32 @@ namespace Tokenizer {
2946
2972
}
2947
2973
}
2948
2974
2975
+ UnicodeString replace_quoted_spaces ( const UnicodeString& in ){
2976
+ UnicodeString result;
2977
+ UChar32 quote = ' \x0 ' ;
2978
+ for ( int i=0 ; i < in.length (); ++i ){
2979
+ UChar32 c = in[i];
2980
+ // cerr << "bekijk: " << UnicodeString( c ) << endl;
2981
+ if ( c == ' "' || c == ' \' ' ){
2982
+ // found quote
2983
+ // cerr << "found quote!" << endl;
2984
+ if ( c == quote ){
2985
+ // so a second one, reset
2986
+ quote = ' \x0 ' ;
2987
+ // cerr << "reset quote!" << endl;
2988
+ }
2989
+ else {
2990
+ quote = c;
2991
+ }
2992
+ }
2993
+ else if ( c == ' ' && quote != ' \x0 ' ){
2994
+ c = U' Ž' ; // mark as Ž
2995
+ }
2996
+ result += c;
2997
+ }
2998
+ return result;
2999
+ }
3000
+
2949
3001
int TokenizerClass::internal_tokenize_line ( const UnicodeString& originput,
2950
3002
const string& _lang ){
2951
3003
if ( originput.isBogus () ){ // only tokenize valid input
@@ -2971,6 +3023,9 @@ namespace Tokenizer {
2971
3023
<< originput << " ] (language= " << lang << " )" << endl;
2972
3024
}
2973
3025
UnicodeString input = originput;
3026
+ if ( keep_quoted_spaces ){
3027
+ input = replace_quoted_spaces ( input );
3028
+ }
2974
3029
if ( doFilter ){
2975
3030
input = settings[lang]->filter .filter ( input );
2976
3031
}
@@ -3290,7 +3345,7 @@ namespace Tokenizer {
3290
3345
else {
3291
3346
if ( tokDebug >= 4 ){
3292
3347
DBG << " \t recurse, match changes the type:"
3293
- << assigned_type << " to " << type << endl;
3348
+ << assigned_type << " to " << type << endl;
3294
3349
}
3295
3350
TokenRole role = (space ? NOROLE : NOSPACE);
3296
3351
if ( paragraphsignal_next ){
0 commit comments