fixed #1355 hitless_words for RT and PQ indexes; set index version to 60; added hitless_words to SHOW INDEX SETTINGS statement; fixed RT index to save disk chunk in version 58 compatible format; added regressions to test 408
tomatolog committed May 21, 2020
1 parent 7cd878f commit d38409e
Showing 14 changed files with 452 additions and 90 deletions.
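For context (not part of the commit itself): hitless_words names the keywords that get indexed without hit (position) data, either all keywords or those listed in one or more word files. A minimal sketch of how the two modes map onto the settings fields this diff touches, assuming the sphinx.h declarations are in scope and with an illustrative file name:

CSphIndexSettings tSettings;

// hitless_words = all: every keyword is indexed without positions
tSettings.m_eHitless = SPH_HITLESS_ALL;

// hitless_words = <file list>: only the listed words are hitless; the list is
// kept as a plain string and parsed later by LoadHitlessWords()
tSettings.m_eHitless = SPH_HITLESS_SOME;
tSettings.m_sHitlessFiles = "common_terms.txt";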
4 changes: 2 additions & 2 deletions src/accumulator.h
@@ -95,7 +95,7 @@ class RtAccum_t
CSphVector<ReplicationCommand_t *> m_dCmd;

bool m_bKeywordDict {true};
DictRefPtr_c m_pDict;
DictRefPtr_c m_pDict;
CSphDict * m_pRefDict = nullptr; // not owned, used only for ==-matching


@@ -109,7 +109,7 @@ class RtAccum_t
void Cleanup();

void AddDocument ( ISphHits * pHits, const CSphMatch & tDoc, bool bReplace, int iRowSize, const char ** ppStr, const VecTraits_T<int64_t> & dMvas, const DocstoreBuilder_i::Doc_t * pStoredDoc );
RtSegment_t * CreateSegment ( int iRowSize, int iWordsCheckpoint );
RtSegment_t * CreateSegment ( int iRowSize, int iWordsCheckpoint, ESphHitless eHitless, const VecTraits_T<SphWordID_t> & dHitlessWords );
void CleanupDuplicates ( int iRowSize );
void GrabLastWarning ( CSphString & sWarning );
void SetIndex ( RtIndex_i * pIndex );
27 changes: 27 additions & 0 deletions src/indexsettings.cpp
@@ -854,6 +854,15 @@ void CSphIndexSettings::Format ( SettingsFormatter_c & tOut, FilenameBuilder_i *
tOut.Add ( "index_token_filter", m_sIndexTokenFilter, !m_sIndexTokenFilter.IsEmpty() );
tOut.Add ( "attr_update_reserve", m_tBlobUpdateSpace, m_tBlobUpdateSpace!=DEFAULT_ATTR_UPDATE_RESERVE );

if ( m_eHitless==SPH_HITLESS_ALL )
{
tOut.Add ( "hitless_words", "all", true );
} else if ( m_eHitless==SPH_HITLESS_SOME )
{
CSphString sHitlessFiles = FormatPath ( m_sHitlessFiles, pFilenameBuilder );
tOut.Add ( "hitless_words", sHitlessFiles, true );
}

DocstoreSettings_t::Format ( tOut, pFilenameBuilder );
}

@@ -948,6 +957,21 @@ bool IndexSettingsContainer_c::AddOption ( const CSphString & sName, const CSphS
return true;
}

if ( sName=="hitless_words" && ( sValue!="none" && sValue!="all" ) )
{
RemoveKeys ( sName );
m_dHitlessFiles.Reset();
StrVec_t dValues = SplitArg ( sValue, m_dHitlessFiles );

// need only names for hitless files
StringBuilder_c sTmp ( " " );
for ( const CSphString & sVal : dValues )
sTmp << sVal;

return Add ( sName, sTmp.cstr() );

}

return Add ( sName, sValue );
}

@@ -1059,6 +1083,9 @@ StrVec_t IndexSettingsContainer_c::GetFiles() const
dFiles.Add(j);
}

for ( const auto & i : m_dHitlessFiles )
dFiles.Add ( i );

return dFiles;
}
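A hypothetical call sketch tying the two hunks above together; the container type and AddOption() come from this diff, while the exact SplitArg() contract (full paths collected into m_dHitlessFiles, bare names re-stored as the option value) is an assumption based on the in-code comment:

IndexSettingsContainer_c tContainer;
tContainer.AddOption ( "hitless_words", "/data/lists/hitless_ru.txt /data/lists/hitless_en.txt" );
// m_dHitlessFiles now remembers the word-list files, so GetFiles() reports them
// alongside stopword, exception and wordform files; the value stored under
// "hitless_words" is rebuilt as a space-separated list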

1 change: 1 addition & 0 deletions src/indexsettings.h
@@ -338,6 +338,7 @@ class IndexSettingsContainer_c
StrVec_t m_dStopwordFiles;
StrVec_t m_dExceptionFiles;
StrVec_t m_dWordformFiles;
StrVec_t m_dHitlessFiles;
CSphString m_sError;
};

93 changes: 49 additions & 44 deletions src/sphinx.cpp
@@ -1984,8 +1984,6 @@ class CSphIndex_VLN : public CSphIndex, public IndexUpdateHelper_c, public Index
private:
mutable CSphIndexProgress m_tProgress;

bool LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords );

private:
int64_t m_iDocinfo; ///< my docinfo cache size
int64_t m_iDocinfoIndex; ///< docinfo "index" entries count (each entry is 2x docinfo rows, for min/max)
@@ -9773,6 +9771,7 @@ void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettin
tWriter.PutString ( tSettings.m_sIndexTokenFilter );
tWriter.PutOffset ( tSettings.m_tBlobUpdateSpace );
tWriter.PutDword ( tSettings.m_iSkiplistBlockSize );
tWriter.PutString ( tSettings.m_sHitlessFiles );
}


@@ -10277,46 +10276,30 @@ bool CSphIndex_VLN::RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSi
}


bool CSphIndex_VLN::LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords )
bool LoadHitlessWords ( const CSphString & sHitlessFiles, ISphTokenizer * pTok, CSphDict * pDict, CSphVector<SphWordID_t> & dHitlessWords, CSphString & sError )
{
assert ( dHitlessWords.GetLength()==0 );

if ( m_tSettings.m_sHitlessFiles.IsEmpty() )
if ( sHitlessFiles.IsEmpty() )
return true;

const char * szStart = m_tSettings.m_sHitlessFiles.cstr();
StrVec_t dFiles;
sphSplit ( dFiles, sHitlessFiles.cstr(), ", " );

while ( *szStart )
for ( const CSphString & sFilename : dFiles )
{
while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
++szStart;

if ( !*szStart )
break;

const char * szWordStart = szStart;

while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
++szStart;

if ( szStart - szWordStart > 0 )
{
CSphString sFilename;
sFilename.SetBinary ( szWordStart, szStart-szWordStart );

CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, m_sLastError );
if ( tFile.GetFD()==-1 )
return false;
CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, sError );
if ( tFile.GetFD()==-1 )
return false;

CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), m_sLastError ) )
return false;
CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), sError ) )
return false;

// FIXME!!! dict=keywords + hitless_words=some
m_pTokenizer->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
while ( BYTE * sToken = m_pTokenizer->GetToken() )
dHitlessWords.Add ( m_pDict->GetWordID ( sToken ) );
}
// FIXME!!! dict=keywords + hitless_words=some
pTok->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
while ( BYTE * sToken = pTok->GetToken() )
dHitlessWords.Add ( pDict->GetWordID ( sToken ) );
}

dHitlessWords.Uniq();
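The rewrite replaces the hand-rolled pointer scan with sphSplit() plus a per-file loop; a minimal sketch of the splitting behaviour the new code relies on (the separator-set semantics are assumed here):

StrVec_t dFiles;
sphSplit ( dFiles, "first.txt, second.txt third.txt", ", " );
// expected: dFiles holds { "first.txt", "second.txt", "third.txt" }, i.e. both
// commas and spaces act as separators, matching the old manual parser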
@@ -11047,7 +11030,7 @@ int CSphIndex_VLN::Build ( const CSphVector<CSphSource*> & dSources, int iMemory

CSphVector<SphWordID_t> dHitlessWords;

if ( !LoadHitlessWords ( dHitlessWords ) )
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizer, m_pDict, dHitlessWords, m_sLastError ) )
return 0;

// vars shared between phases
@@ -13818,6 +13801,9 @@ void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DW
tSettings.m_iSkiplistBlockSize = 128;
else
tSettings.m_iSkiplistBlockSize = (int)tReader.GetDword();

if ( uVersion>=60 )
tSettings.m_sHitlessFiles = tReader.GetString();
}


@@ -16773,7 +16759,7 @@ int CSphIndex_VLN::DebugCheck ( FILE * fp )
if ( !pIndexChecker->OpenFiles(sError) )
return 1;

if ( !LoadHitlessWords ( pIndexChecker->GetHitlessWords() ) )
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizer, m_pDict, pIndexChecker->GetHitlessWords(), m_sLastError ) )
tReporter.Fail ( "unable to load hitless words: %s", m_sLastError.cstr() );

CSphSavedFile tStat;
@@ -20346,36 +20332,50 @@ class CRtDictKeywords final : public ISphRtDictWraper
CSphString m_sWarning;
int m_iKeywordsOverrun = 0;
CSphString m_sWord; // For allocation reuse.
const bool m_bStoreID = false;

protected:
virtual ~CRtDictKeywords () final {} // fixme! remove

public:
explicit CRtDictKeywords ( CSphDict * pBase )
explicit CRtDictKeywords ( CSphDict * pBase, bool bStoreID )
: m_pBase ( pBase )
, m_bStoreID ( bStoreID )
{
SafeAddRef ( pBase );
m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
}

SphWordID_t GetWordID ( BYTE * pWord ) final
{
return m_pBase->GetWordID ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordID ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordIDWithMarkers ( BYTE * pWord ) final
{
return m_pBase->GetWordIDWithMarkers ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordIDWithMarkers ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordIDNonStemmed ( BYTE * pWord ) final
{
return m_pBase->GetWordIDNonStemmed ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordIDNonStemmed ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) final
{
return m_pBase->GetWordID ( pWord, iLen, bFilterStops ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordID ( pWord, iLen, bFilterStops );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

const BYTE * GetPackedKeywords () final { return m_dPackedKeywords.Begin(); }
@@ -20405,7 +20405,7 @@ class CRtDictKeywords final : public ISphRtDictWraper
uint64_t GetSettingsFNV () const final { return m_pBase->GetSettingsFNV(); }

private:
SphWordID_t AddKeyword ( const BYTE * pWord )
SphWordID_t AddKeyword ( const BYTE * pWord, SphWordID_t tWordID )
{
int iLen = strlen ( ( const char * ) pWord );
// stemmer might squeeze out the word
@@ -20439,19 +20439,24 @@ class CRtDictKeywords final : public ISphRtDictWraper
}

int iOff = m_dPackedKeywords.GetLength ();
m_dPackedKeywords.Resize ( iOff + iLen + 1 );
int iPackedLen = iOff + iLen + 1;
if ( m_bStoreID )
iPackedLen += sizeof ( tWordID );
m_dPackedKeywords.Resize ( iPackedLen );
m_dPackedKeywords[iOff] = ( BYTE ) ( iLen & 0xFF );
memcpy ( m_dPackedKeywords.Begin () + iOff + 1, pWord, iLen );
if ( m_bStoreID )
memcpy ( m_dPackedKeywords.Begin () + iOff + 1 + iLen, &tWordID, sizeof(tWordID) );

m_hKeywords.Add ( iOff, m_sWord );

return iOff;
}
};

ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase )
ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase, bool bStoreID )
{
return new CRtDictKeywords ( pBase );
return new CRtDictKeywords ( pBase, bStoreID );
}
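The packed-keywords layout implied by AddKeyword() above is one length byte, iLen bytes of keyword text, and, only when m_bStoreID is set, the raw SphWordID_t appended right after the word. A reader-side sketch for illustration; this unpacking code is an assumption, not part of the commit:

// unpack one entry at offset iOff, mirroring the writer above
const BYTE * pEntry = m_dPackedKeywords.Begin() + iOff;
int iWordLen = pEntry[0];                    // length byte
const BYTE * pWordText = pEntry + 1;         // iWordLen bytes of keyword text, not zero-terminated
SphWordID_t tStoredID = 0;
if ( m_bStoreID )                            // the id is present only in store-ID mode
	memcpy ( &tStoredID, pEntry + 1 + iWordLen, sizeof(tStoredID) );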


5 changes: 3 additions & 2 deletions src/sphinxint.h
@@ -42,7 +42,7 @@ inline const char * strerrorm ( int errnum )
//////////////////////////////////////////////////////////////////////////

const DWORD INDEX_MAGIC_HEADER = 0x58485053; ///< my magic 'SPHX' header
const DWORD INDEX_FORMAT_VERSION = 59; ///< my format version
const DWORD INDEX_FORMAT_VERSION = 60; ///< my format version

const char MAGIC_SYNONYM_WHITESPACE = 1; // used internally in tokenizer only
const char MAGIC_CODE_SENTENCE = 2; // emitted from tokenizer on sentence boundary
@@ -1673,6 +1673,7 @@ void ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion
void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings );
void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion );
bool AddFieldLens ( CSphSchema & tSchema, bool bDynamic, CSphString & sError );
bool LoadHitlessWords ( const CSphString & sHitlessFiles, ISphTokenizer * pTok, CSphDict * pDict, CSphVector<SphWordID_t> & dHitlessWords, CSphString & sError );

/// Get current thread local index - internal do not use
class RtIndex_i;
@@ -1888,7 +1889,7 @@ class ISphRtDictWraper : public CSphDict
virtual void ResetWarning() = 0;
};

ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase );
ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase, bool bStoreID );

struct SphExpanded_t
{
23 changes: 15 additions & 8 deletions src/sphinxpq.cpp
@@ -165,6 +165,7 @@ class PercolateIndex_c : public PercolateIndex_i

CSphFixedVector<StoredQueryDesc_t> m_dLoadedQueries { 0 }; // temporary, just descriptions
CSphSchema m_tMatchSchema;
CSphVector<SphWordID_t> m_dHitlessWords;

void DoMatchDocuments ( const RtSegment_t * pSeg, PercolateMatchResult_t & tRes );
bool MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters,
@@ -191,6 +192,8 @@
void AddToStoredUnl ( StoredQuerySharedPtr_t tNew ) REQUIRES ( m_tLockHash, m_tLock );
void PostSetupUnl () REQUIRES ( m_tLockHash, m_tLock );
SharedPQSlice_t GetStored () const EXCLUDES ( m_tLock );

bool NeedStoreWordID () const override { return ( m_tSettings.m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.GetLength() ); }
};

//////////////////////////////////////////////////////////////////////////
Expand All @@ -207,7 +210,7 @@ PercolateIndex_i * CreateIndexPercolate ( const CSphSchema & tSchema, const char
return new PercolateIndex_c ( tSchema, sIndexName, sPath );
}

static SegmentReject_t SegmentGetRejects ( const RtSegment_t * pSeg, bool bBuildInfix, bool bUtf8 )
static SegmentReject_t SegmentGetRejects ( const RtSegment_t * pSeg, bool bBuildInfix, bool bUtf8, ESphHitless eHitless )
{
SegmentReject_t tReject;
tReject.m_iRows = pSeg->m_uRows;
@@ -227,7 +230,7 @@
tReject.m_dWilds.Fill ( 0 );
}

RtWordReader_t tDict ( pSeg, true, PERCOLATE_WORDS_PER_CP );
RtWordReader_t tDict ( pSeg, true, PERCOLATE_WORDS_PER_CP, eHitless );
const RtWord_t * pWord = nullptr;
BloomGenTraits_t tBloom0 ( tReject.m_dWilds.Begin() );
BloomGenTraits_t tBloom1 ( tReject.m_dWilds.Begin() + PERCOLATE_BLOOM_WILD_COUNT );
@@ -947,7 +950,7 @@ bool PercolateQwordSetup_c::QwordSetup ( ISphQword * pQword ) const
CSphVector<Slice_t> dDictWords;
ARRAY_FOREACH ( i, dDictLoc )
{
RtWordReader_t tReader ( m_pSeg, true, PERCOLATE_WORDS_PER_CP );
RtWordReader_t tReader ( m_pSeg, true, PERCOLATE_WORDS_PER_CP, m_eHitless );
// locator
// m_uOff - Start
// m_uLen - End
@@ -1034,10 +1037,10 @@ SphWordID_t DictMap_t::GetTerm ( BYTE * sWord ) const
return pTerm->m_uWordID;
}

PercolateMatchContext_t * PercolateIndex_c::CreateMatchContext ( const RtSegment_t * pSeg, const SegmentReject_t &tReject )
PercolateMatchContext_t * PercolateIndex_c::CreateMatchContext ( const RtSegment_t * pSeg, const SegmentReject_t & tReject )
{
return new PercolateMatchContext_t ( pSeg, m_iMaxCodepointLength, m_pDict->HasMorphology(), GetStatelessDict ( m_pDict ), this
, m_tSchema, tReject );
, m_tSchema, tReject, m_tSettings.m_eHitless );
}

// percolate matching
@@ -1382,7 +1385,7 @@ void PercolateIndex_c::DoMatchDocuments ( const RtSegment_t * pSeg, PercolateMat
{
// reject need bloom filter for either infix or prefix
auto tReject = SegmentGetRejects (
pSeg, ( m_tSettings.m_iMinInfixLen>0 || m_tSettings.GetMinPrefixLen ( m_pDict->GetSettings().m_bWordDict )>0 ), m_iMaxCodepointLength>1 );
pSeg, ( m_tSettings.m_iMinInfixLen>0 || m_tSettings.GetMinPrefixLen ( m_pDict->GetSettings().m_bWordDict )>0 ), m_iMaxCodepointLength>1, m_tSettings.m_eHitless );

CSphAtomic iCurQuery;
CSphFixedVector<PercolateMatchContext_t *> dResults ( 1 );
@@ -1462,11 +1465,11 @@ bool PercolateIndex_c::MatchDocuments ( RtAccum_t * pAccExt, PercolateMatchResul

pAcc->Sort();

RtSegment_t * pSeg = pAcc->CreateSegment ( m_tSchema.GetRowSize(), PERCOLATE_WORDS_PER_CP );
RtSegment_t * pSeg = pAcc->CreateSegment ( m_tSchema.GetRowSize(), PERCOLATE_WORDS_PER_CP, m_tSettings.m_eHitless, m_dHitlessWords );
assert ( !pSeg || pSeg->m_uRows>0 );
assert ( !pSeg || pSeg->m_tAliveRows>0 );
BuildSegmentInfixes ( pSeg, m_pDict->HasMorphology(), true, m_tSettings.m_iMinInfixLen,
PERCOLATE_WORDS_PER_CP, ( m_iMaxCodepointLength>1 ) );
PERCOLATE_WORDS_PER_CP, ( m_iMaxCodepointLength>1 ), m_tSettings.m_eHitless );

DoMatchDocuments ( pSeg, tRes );
SafeRelease ( pSeg );
@@ -2059,6 +2062,10 @@ void PercolateIndex_c::PostSetupUnl()
if ( m_tSettings.m_bIndexExactWords )
SetupExactDict ( pDict, pTokenizer );

// hitless
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizerIndexing, m_pDict, m_dHitlessWords, m_sLastError ) )
sphWarning ( "index '%s': %s", m_sIndexName.cstr(), m_sLastError.cstr() );

m_pQueries->ReserveGap( m_dLoadedQueries.GetLength () );

CSphString sError;
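Read together, the sphinxpq.cpp hunks wire hitless support into the percolate path: PostSetupUnl() loads the word list into m_dHitlessWords, NeedStoreWordID() reports true for hitless_words=some, and the hitless mode plus word list flow into segment creation and the RT word readers. A sketch of the assumed dictionary hookup (the actual call site is not shown in this diff, and pDict stands for the index dictionary):

// assumed wiring: the RT keywords dict wrapper runs in store-ID mode when word
// IDs are needed to match packed keywords against m_dHitlessWords
bool bStoreID = NeedStoreWordID();
ISphRtDictWraper * pDictRt = sphCreateRtKeywordsDictionaryWrapper ( pDict, bStoreID );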
