Skip to content

Commit 960e9ab

Browse files
committed
Added a (tricky, hidden) option --keep-spaces-inside-quotes. It allows, in a
limited way, keeping quoted values together as one unit: "One quote", or ' keep this alive '
1 parent f94d7ef commit 960e9ab

File tree

3 files changed

+70
-4
lines changed

3 files changed

+70
-4
lines changed

include/ucto/tokenize.h

+5
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ namespace Tokenizer {
4545

4646
const std::string Version();
4747
const std::string VersionName();
48+
// Namespace-wide switch (defined in src/tokenize.cxx, initially false).
// When true, internal_tokenize_line() runs replace_quoted_spaces() on its
// input and the Token constructors run filter_ZCARON() on the token text.
extern bool keep_quoted_spaces;
4849

4950
enum TokenRole {
5051
NOROLE = 0,
@@ -273,6 +274,10 @@ namespace Tokenizer {
273274
bool setUndLang( bool b ){ bool r = und_language; und_language = b; return r; };
274275
bool getUndLang(){ return und_language; };
275276

277+
// Switch the keep-quoted-spaces behaviour on or off.
// Returns the value that was in effect before this call.
// Note: the flag written here is the namespace-level variable
// keep_quoted_spaces, not a per-object member.
bool setKeepQuotedSpaces( bool b ){
  const bool previous = keep_quoted_spaces;
  keep_quoted_spaces = b;
  return previous;
}
279+
// Report the current value of the namespace-level keep_quoted_spaces flag.
bool getKeepQuotedSpaces() const {
  return keep_quoted_spaces;
}
280+
276281
const std::string& getInputClass( ) const { return inputclass; }
277282
const std::string setInputClass( const std::string& cls) {
278283
std::string res = inputclass;

src/tokenize.cxx

+59-4
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ namespace Tokenizer {
7676

7777
using namespace icu;
7878
using TiCC::operator<<;
79+
// Definition of the switch declared 'extern' in include/ucto/tokenize.h.
// Off by default; consulted by the Token constructors and by
// internal_tokenize_line() in this file.
bool keep_quoted_spaces = false;
7980

8081
const UChar32 ZWJ = u'\u200D';
8182

@@ -154,16 +155,41 @@ namespace Tokenizer {
154155
const UnicodeString type_unknown = "UNKNOWN";
155156
const UnicodeString type_unanalyzed = "UNANALYZED";
156157

158+
// Return a copy of 'in' in which every 'Ž' has been turned into a plain
// space.
//
// 'Ž' is the placeholder that replace_quoted_spaces() writes in place of a
// space inside quotes, so this restores those spaces.
// Caveat: a 'Ž' that was genuinely part of the text cannot be told apart
// from a placeholder and is converted to a space as well.
UnicodeString filter_ZCARON( const UnicodeString& in ){
  const UChar32 marker = U'Ž';
  const UChar32 blank = ' ';
  UnicodeString cleaned;
  const int len = in.length();
  for ( int pos = 0; pos < len; ++pos ){
    const UChar32 ch = in[pos];
    cleaned += ( ch == marker ? blank : ch );
  }
  return cleaned;
}
169+
157170
// Diff hunk for the Token constructor that takes an explicit role.
// Before this commit 'us' was set to _s in the initializer list (the lines
// tagged '-'). Afterwards (the lines tagged '+') 'us' is assigned in the
// body: filter_ZCARON( _s ) when keep_quoted_spaces is set, otherwise _s
// unchanged. type, role and lang_code are still copied from the arguments.
Token::Token( const UnicodeString& _type,
158171
const UnicodeString& _s,
159-
TokenRole _role, const string& _lang_code ):
160-
type(_type), us(_s), role(_role), lang_code(_lang_code) {
172+
TokenRole _role,
173+
const string& _lang_code ):
174+
type(_type), role(_role), lang_code(_lang_code) {
175+
if ( keep_quoted_spaces ){
176+
us = filter_ZCARON( _s );
177+
}
178+
else {
179+
us = _s;
180+
}
161181
}
162182

163183
// Diff hunk for the Token constructor without a role argument; the role is
// always NOROLE. As in the other constructor, the initializer 'us(_s)' (the
// line tagged '-') is replaced by an assignment in the body:
// filter_ZCARON( _s ) when keep_quoted_spaces is set, otherwise _s unchanged.
Token::Token( const UnicodeString& _type,
164184
const UnicodeString& _s,
165185
const string& _lang_code ):
166-
type(_type), us(_s), role(NOROLE), lang_code(_lang_code) {
186+
type(_type), role(NOROLE), lang_code(_lang_code) {
187+
if ( keep_quoted_spaces ){
188+
us = filter_ZCARON( _s );
189+
}
190+
else {
191+
us = _s;
192+
}
167193
}
168194

169195

@@ -2946,6 +2972,32 @@ namespace Tokenizer {
29462972
}
29472973
}
29482974

2975+
// Return a copy of 'in' in which every space that lies between an opening
// quote and its matching closing quote is replaced by the placeholder 'Ž'.
//
// Only the ASCII quotes '"' and '\'' are recognised. A quote opens a quoted
// stretch when none is open, and only the SAME quote character closes it;
// the other kind of quote appearing inside an open stretch is ordinary
// content (e.g. the apostrophe in "it's here"). Previously such a character
// took over the quote state, so the real closing quote re-opened it and
// spaces after the quoted part were marked too.
//
// An unterminated quote stays open until the end of 'in', so all spaces
// after it are marked.
// The placeholders are turned back into spaces by filter_ZCARON(); a 'Ž'
// already present in the input is indistinguishable from a placeholder.
UnicodeString replace_quoted_spaces( const UnicodeString& in ){
  const UChar32 no_quote = '\x0';
  UnicodeString result;
  UChar32 quote = no_quote; // the quote character currently open, if any
  for ( int i=0; i < in.length(); ++i ){
    UChar32 c = in[i];
    if ( c == '"' || c == '\'' ){
      if ( quote == no_quote ){
        // nothing open yet: this quote starts a quoted stretch
        quote = c;
      }
      else if ( c == quote ){
        // the matching quote: the stretch ends here
        quote = no_quote;
      }
      // otherwise: a different quote inside an open stretch, keep as is
    }
    else if ( c == ' ' && quote != no_quote ){
      c = U'Ž'; // mark the space with the placeholder
    }
    result += c;
  }
  return result;
}
3000+
29493001
int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
29503002
const string& _lang ){
29513003
if ( originput.isBogus() ){ //only tokenize valid input
@@ -2971,6 +3023,9 @@ namespace Tokenizer {
29713023
<< originput << "] (language= " << lang << ")" << endl;
29723024
}
29733025
UnicodeString input = originput;
3026+
if ( keep_quoted_spaces ){
3027+
input = replace_quoted_spaces( input );
3028+
}
29743029
if ( doFilter ){
29753030
input = settings[lang]->filter.filter( input );
29763031
}
@@ -3290,7 +3345,7 @@ namespace Tokenizer {
32903345
else {
32913346
if ( tokDebug >= 4 ){
32923347
DBG << "\trecurse, match changes the type:"
3293-
<< assigned_type << " to " << type << endl;
3348+
<< assigned_type << " to " << type << endl;
32943349
}
32953350
TokenRole role = (space ? NOROLE : NOSPACE);
32963351
if ( paragraphsignal_next ){

src/ucto.cxx

+6
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,11 @@ void runtime_opts::fill( TiCC::CL_Options& Opts ){
373373
pass_thru = Opts.extract( "passthru" );
374374
Opts.extract("normalize", norm_set_string );
375375
Opts.extract( "separators", separators );
376+
Opts.extract( "keep-spaces-inside-quotes", keep_quoted_spaces );
377+
if ( keep_quoted_spaces && quotedetection ){
378+
throw TiCC::OptionError( "ucto: combining '--keep-spaces-inside-quotes' "
379+
"conflicts with '-Q'" );
380+
}
376381
if ( Opts.extract( 'x', docid ) ){
377382
throw TiCC::OptionError( "ucto: The option '-x ID' is removed. "
378383
"Please use '-X' and '--id=ID' instead" );
@@ -713,6 +718,7 @@ int main( int argc, char *argv[] ){
713718
"help,detectlanguages:,uselanguages:,"
714719
"textredundancy:,add-tokens:,split,"
715720
"allow-word-corrections,ignore-tag-hints,"
721+
"keep-spaces-inside-quotes,"
716722
"separators:");
717723
Opts.init(argc, argv );
718724
if ( Opts.extract( 'h' )

0 commit comments

Comments
 (0)