added a (tricky, hidden) option --keep-spaces-inside-quotes. Allows a limited

kosloot · kosloot · commit 960e9abb3cc0 · 2025-02-08T14:35:29.000+01:00
use to keep quoted values together: "One quote", or ' keep this alive '
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
@@ -45,6 +45,7 @@ namespace Tokenizer {
 
   const std::string Version();
   const std::string VersionName();
+  extern bool keep_quoted_spaces; // namespace-wide flag, defined in src/tokenize.cxx
 
   enum TokenRole {
     NOROLE                      = 0,
@@ -273,6 +274,10 @@ namespace Tokenizer {
     bool setUndLang( bool b ){ bool r = und_language; und_language = b; return r; };
     bool getUndLang(){ return und_language; };
 
+    bool setKeepQuotedSpaces( bool b ){ // set the namespace-wide flag, return the old value
+      const bool previous = keep_quoted_spaces; keep_quoted_spaces = b; return previous; };
+    bool getKeepQuotedSpaces() const { return keep_quoted_spaces; }; // current value of the namespace-wide flag
+
     const std::string& getInputClass( ) const { return inputclass; }
     const std::string setInputClass( const std::string& cls) {
       std::string res = inputclass;
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
@@ -76,6 +76,7 @@ namespace Tokenizer {
 
   using namespace icu;
   using TiCC::operator<<;
+  bool keep_quoted_spaces = false; // off by default; read by the Token constructors and internal_tokenize_line()
 
   const UChar32 ZWJ = u'\u200D';
 
@@ -154,16 +155,41 @@ namespace Tokenizer {
   const UnicodeString type_unknown = "UNKNOWN";
   const UnicodeString type_unanalyzed = "UNANALYZED";
 
+  // Return a copy of 'in' with every U+017D (Ž) code unit, the marker used
+  // by replace_quoted_spaces(), replaced by a plain space.
+  UnicodeString filter_ZCARON( const UnicodeString& in ){
+    const UChar marker = 0x017D; // Ž
+    UnicodeString unmarked;
+    for ( int pos = 0; pos < in.length(); ++pos ){
+      const UChar unit = in.charAt( pos );
+      unmarked += ( unit == marker ? UChar(' ') : unit );
+    }
+    return unmarked;
+  }
+
   Token::Token( const UnicodeString& _type,
 		const UnicodeString& _s,
-		TokenRole _role, const string& _lang_code ):
-    type(_type), us(_s), role(_role), lang_code(_lang_code) {
+		TokenRole _role,
+		const string& _lang_code ):
+    // 'us' holds the token text. With keep_quoted_spaces set, the Ž markers
+    // (see replace_quoted_spaces()) are mapped back to spaces first;
+    // otherwise _s is copied unchanged.
+    type(_type),
+    us( keep_quoted_spaces ? filter_ZCARON( _s ) : _s ),
+    role(_role),
+    lang_code(_lang_code) {
   }
 
   Token::Token( const UnicodeString& _type,
 		const UnicodeString& _s,
 		const string& _lang_code ):
-    type(_type), us(_s), role(NOROLE), lang_code(_lang_code) {
+    // Role-less variant: the role is NOROLE. With keep_quoted_spaces set,
+    // the Ž markers in _s (see replace_quoted_spaces()) are mapped back to
+    // spaces; otherwise _s is copied unchanged.
+    type(_type),
+    us( keep_quoted_spaces ? filter_ZCARON( _s ) : _s ),
+    role(NOROLE),
+    lang_code(_lang_code) {
   }
 
 
@@ -2946,6 +2972,32 @@ namespace Tokenizer {
     }
   }
 
+  // Replace every space inside a quoted span of 'in' by the marker U+017D (Ž);
+  // filter_ZCARON() turns the marker back into a space. A span opens at '"'
+  // or '\'' and is closed by the SAME character only: the other quote
+  // character inside an open span is copied as ordinary text.
+  // NOTE: an unpaired quote (e.g. an apostrophe) opens a span up to the end.
+  UnicodeString replace_quoted_spaces( const UnicodeString& in ){
+    UnicodeString result;
+    UChar32 quote = '\x0'; // quote character of the open span, 0 when none
+    for ( int i=0; i < in.length(); ++i ){
+      UChar32 c = in[i];
+      if ( quote == '\x0' ){
+	if ( c == '"' || c == '\'' ){
+	  quote = c; // a span starts here
+	}
+      }
+      else if ( c == quote ){
+	quote = '\x0'; // the matching quote: the span ends here
+      }
+      else if ( c == ' ' ){
+	c = U'Ž'; // mark the space
+      }
+      result += c;
+    }
+    return result;
+  }
+
   int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
 					      const string& _lang ){
     if ( originput.isBogus() ){ //only tokenize valid input
@@ -2971,6 +3023,9 @@ namespace Tokenizer {
 	  << originput << "] (language= " << lang << ")" << endl;
     }
     UnicodeString input = originput;
+    if ( keep_quoted_spaces ){
+      input = replace_quoted_spaces( input );
+    }
     if ( doFilter ){
       input = settings[lang]->filter.filter( input );
     }
@@ -3290,7 +3345,7 @@ namespace Tokenizer {
 	    else {
 	      if ( tokDebug >= 4 ){
 		DBG << "\trecurse, match changes the type:"
-				<< assigned_type << " to " << type << endl;
+		    << assigned_type << " to " << type << endl;
 	      }
 	      TokenRole role = (space ? NOROLE : NOSPACE);
 	      if ( paragraphsignal_next ){
diff --git a/src/ucto.cxx b/src/ucto.cxx
@@ -373,6 +373,11 @@ void runtime_opts::fill( TiCC::CL_Options& Opts ){
   pass_thru = Opts.extract( "passthru" );
   Opts.extract("normalize", norm_set_string );
   Opts.extract( "separators", separators );
+  Opts.extract( "keep-spaces-inside-quotes", keep_quoted_spaces );
+  if ( keep_quoted_spaces && quotedetection ){
+    throw TiCC::OptionError( "ucto: combining '--keep-spaces-inside-quotes' "
+			     "conflicts with '-Q'" );
+  }
   if ( Opts.extract( 'x', docid ) ){
     throw TiCC::OptionError( "ucto: The option '-x ID' is removed. "
 			     "Please use '-X' and '--id=ID' instead" );
@@ -713,6 +718,7 @@ int main( int argc, char *argv[] ){
 			   "help,detectlanguages:,uselanguages:,"
 			   "textredundancy:,add-tokens:,split,"
 			   "allow-word-corrections,ignore-tag-hints,"
+			   "keep-spaces-inside-quotes,"
 			   "separators:");
     Opts.init(argc, argv );
     if ( Opts.extract( 'h' )