fixed #1355 hitless_words for RT and PQ indexes; set index version to 60; added hitless_words to SHOW INDEX SETTINGS statement; fixed RT index to save disk chunk in version 58 compatible format; added regressions to test 408
tomatolog committed May 21, 2020
1 parent 7cd878f commit d38409e
Showing 14 changed files with 452 additions and 90 deletions.
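For context (not part of the commit itself): hitless_words names the keywords that get indexed without hit (position) data, either all keywords or those listed in one or more word files. A minimal sketch of how the two modes map onto the settings fields this diff touches, assuming the sphinx.h declarations are in scope and with an illustrative file name:

CSphIndexSettings tSettings;

// hitless_words = all: every keyword is indexed without positions
tSettings.m_eHitless = SPH_HITLESS_ALL;

// hitless_words = <file list>: only the listed words are hitless; the list is
// kept as a plain string and parsed later by LoadHitlessWords()
tSettings.m_eHitless = SPH_HITLESS_SOME;
tSettings.m_sHitlessFiles = "common_terms.txt";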
4 changes: 2 additions & 2 deletions src/accumulator.h
@@ -95,7 +95,7 @@ class RtAccum_t
CSphVector<ReplicationCommand_t *> m_dCmd;

bool m_bKeywordDict {true};
DictRefPtr_c m_pDict;
DictRefPtr_c m_pDict;
CSphDict * m_pRefDict = nullptr; // not owned, used only for ==-matching


@@ -109,7 +109,7 @@ class RtAccum_t
void Cleanup();

void AddDocument ( ISphHits * pHits, const CSphMatch & tDoc, bool bReplace, int iRowSize, const char ** ppStr, const VecTraits_T<int64_t> & dMvas, const DocstoreBuilder_i::Doc_t * pStoredDoc );
RtSegment_t * CreateSegment ( int iRowSize, int iWordsCheckpoint );
RtSegment_t * CreateSegment ( int iRowSize, int iWordsCheckpoint, ESphHitless eHitless, const VecTraits_T<SphWordID_t> & dHitlessWords );
void CleanupDuplicates ( int iRowSize );
void GrabLastWarning ( CSphString & sWarning );
void SetIndex ( RtIndex_i * pIndex );
27 changes: 27 additions & 0 deletions src/indexsettings.cpp
@@ -854,6 +854,15 @@ void CSphIndexSettings::Format ( SettingsFormatter_c & tOut, FilenameBuilder_i *
tOut.Add ( "index_token_filter", m_sIndexTokenFilter, !m_sIndexTokenFilter.IsEmpty() );
tOut.Add ( "attr_update_reserve", m_tBlobUpdateSpace, m_tBlobUpdateSpace!=DEFAULT_ATTR_UPDATE_RESERVE );

if ( m_eHitless==SPH_HITLESS_ALL )
{
tOut.Add ( "hitless_words", "all", true );
} else if ( m_eHitless==SPH_HITLESS_SOME )
{
CSphString sHitlessFiles = FormatPath ( m_sHitlessFiles, pFilenameBuilder );
tOut.Add ( "hitless_words", sHitlessFiles, true );
}

DocstoreSettings_t::Format ( tOut, pFilenameBuilder );
}

@@ -948,6 +957,21 @@ bool IndexSettingsContainer_c::AddOption ( const CSphString & sName, const CSphS
return true;
}

if ( sName=="hitless_words" && ( sValue!="none" && sValue!="all" ) )
{
RemoveKeys ( sName );
m_dHitlessFiles.Reset();
StrVec_t dValues = SplitArg ( sValue, m_dHitlessFiles );

// need only names for hitless files
StringBuilder_c sTmp ( " " );
for ( const CSphString & sVal : dValues )
sTmp << sVal;

return Add ( sName, sTmp.cstr() );

}

return Add ( sName, sValue );
}

@@ -1059,6 +1083,9 @@ StrVec_t IndexSettingsContainer_c::GetFiles() const
dFiles.Add(j);
}

for ( const auto & i : m_dHitlessFiles )
dFiles.Add ( i );

return dFiles;
}
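A hypothetical call sketch tying the two hunks above together; the container type and AddOption() come from this diff, while the exact SplitArg() contract (full paths collected into m_dHitlessFiles, bare names re-stored as the option value) is an assumption based on the in-code comment:

IndexSettingsContainer_c tContainer;
tContainer.AddOption ( "hitless_words", "/data/lists/hitless_ru.txt /data/lists/hitless_en.txt" );
// m_dHitlessFiles now remembers the word-list files, so GetFiles() reports them
// alongside stopword, exception and wordform files; the value stored under
// "hitless_words" is rebuilt as a space-separated list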

1 change: 1 addition & 0 deletions src/indexsettings.h
@@ -338,6 +338,7 @@ class IndexSettingsContainer_c
StrVec_t m_dStopwordFiles;
StrVec_t m_dExceptionFiles;
StrVec_t m_dWordformFiles;
StrVec_t m_dHitlessFiles;
CSphString m_sError;
};

93 changes: 49 additions & 44 deletions src/sphinx.cpp
@@ -1984,8 +1984,6 @@ class CSphIndex_VLN : public CSphIndex, public IndexUpdateHelper_c, public Index
private:
mutable CSphIndexProgress m_tProgress;

bool LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords );

private:
int64_t m_iDocinfo; ///< my docinfo cache size
int64_t m_iDocinfoIndex; ///< docinfo "index" entries count (each entry is 2x docinfo rows, for min/max)
@@ -9773,6 +9771,7 @@ void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettin
tWriter.PutString ( tSettings.m_sIndexTokenFilter );
tWriter.PutOffset ( tSettings.m_tBlobUpdateSpace );
tWriter.PutDword ( tSettings.m_iSkiplistBlockSize );
tWriter.PutString ( tSettings.m_sHitlessFiles );
}


@@ -10277,46 +10276,30 @@ bool CSphIndex_VLN::RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSi
}


bool CSphIndex_VLN::LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords )
bool LoadHitlessWords ( const CSphString & sHitlessFiles, ISphTokenizer * pTok, CSphDict * pDict, CSphVector<SphWordID_t> & dHitlessWords, CSphString & sError )
{
assert ( dHitlessWords.GetLength()==0 );

if ( m_tSettings.m_sHitlessFiles.IsEmpty() )
if ( sHitlessFiles.IsEmpty() )
return true;

const char * szStart = m_tSettings.m_sHitlessFiles.cstr();
StrVec_t dFiles;
sphSplit ( dFiles, sHitlessFiles.cstr(), ", " );

while ( *szStart )
for ( const CSphString & sFilename : dFiles )
{
while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
++szStart;

if ( !*szStart )
break;

const char * szWordStart = szStart;

while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
++szStart;

if ( szStart - szWordStart > 0 )
{
CSphString sFilename;
sFilename.SetBinary ( szWordStart, szStart-szWordStart );

CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, m_sLastError );
if ( tFile.GetFD()==-1 )
return false;
CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, sError );
if ( tFile.GetFD()==-1 )
return false;

CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), m_sLastError ) )
return false;
CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), sError ) )
return false;

// FIXME!!! dict=keywords + hitless_words=some
m_pTokenizer->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
while ( BYTE * sToken = m_pTokenizer->GetToken() )
dHitlessWords.Add ( m_pDict->GetWordID ( sToken ) );
}
// FIXME!!! dict=keywords + hitless_words=some
pTok->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
while ( BYTE * sToken = pTok->GetToken() )
dHitlessWords.Add ( pDict->GetWordID ( sToken ) );
}

dHitlessWords.Uniq();
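The rewrite replaces the hand-rolled pointer scan with sphSplit() plus a per-file loop; a minimal sketch of the splitting behaviour the new code relies on (the separator-set semantics are assumed here):

StrVec_t dFiles;
sphSplit ( dFiles, "first.txt, second.txt third.txt", ", " );
// expected: dFiles holds { "first.txt", "second.txt", "third.txt" }, i.e. both
// commas and spaces act as separators, matching the old manual parser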
@@ -11047,7 +11030,7 @@ int CSphIndex_VLN::Build ( const CSphVector<CSphSource*> & dSources, int iMemory

CSphVector<SphWordID_t> dHitlessWords;

if ( !LoadHitlessWords ( dHitlessWords ) )
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizer, m_pDict, dHitlessWords, m_sLastError ) )
return 0;

// vars shared between phases
@@ -13818,6 +13801,9 @@ void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DW
tSettings.m_iSkiplistBlockSize = 128;
else
tSettings.m_iSkiplistBlockSize = (int)tReader.GetDword();

if ( uVersion>=60 )
tSettings.m_sHitlessFiles = tReader.GetString();
}


@@ -16773,7 +16759,7 @@ int CSphIndex_VLN::DebugCheck ( FILE * fp )
if ( !pIndexChecker->OpenFiles(sError) )
return 1;

if ( !LoadHitlessWords ( pIndexChecker->GetHitlessWords() ) )
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizer, m_pDict, pIndexChecker->GetHitlessWords(), m_sLastError ) )
tReporter.Fail ( "unable to load hitless words: %s", m_sLastError.cstr() );

CSphSavedFile tStat;
@@ -20346,36 +20332,50 @@ class CRtDictKeywords final : public ISphRtDictWraper
CSphString m_sWarning;
int m_iKeywordsOverrun = 0;
CSphString m_sWord; // For allocation reuse.
const bool m_bStoreID = false;

protected:
virtual ~CRtDictKeywords () final {} // fixme! remove

public:
explicit CRtDictKeywords ( CSphDict * pBase )
explicit CRtDictKeywords ( CSphDict * pBase, bool bStoreID )
: m_pBase ( pBase )
, m_bStoreID ( bStoreID )
{
SafeAddRef ( pBase );
m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
}

SphWordID_t GetWordID ( BYTE * pWord ) final
{
return m_pBase->GetWordID ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordID ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordIDWithMarkers ( BYTE * pWord ) final
{
return m_pBase->GetWordIDWithMarkers ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordIDWithMarkers ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordIDNonStemmed ( BYTE * pWord ) final
{
return m_pBase->GetWordIDNonStemmed ( pWord ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordIDNonStemmed ( pWord );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) final
{
return m_pBase->GetWordID ( pWord, iLen, bFilterStops ) ? AddKeyword ( pWord ) : 0;
SphWordID_t tWordID = m_pBase->GetWordID ( pWord, iLen, bFilterStops );
if ( tWordID )
return AddKeyword ( pWord, tWordID );
return 0;
}

const BYTE * GetPackedKeywords () final { return m_dPackedKeywords.Begin(); }
@@ -20405,7 +20405,7 @@ class CRtDictKeywords final : public ISphRtDictWraper
uint64_t GetSettingsFNV () const final { return m_pBase->GetSettingsFNV(); }

private:
SphWordID_t AddKeyword ( const BYTE * pWord )
SphWordID_t AddKeyword ( const BYTE * pWord, SphWordID_t tWordID )
{
int iLen = strlen ( ( const char * ) pWord );
// stemmer might squeeze out the word
@@ -20439,19 +20439,24 @@ class CRtDictKeywords final : public ISphRtDictWraper
}

int iOff = m_dPackedKeywords.GetLength ();
m_dPackedKeywords.Resize ( iOff + iLen + 1 );
int iPackedLen = iOff + iLen + 1;
if ( m_bStoreID )
iPackedLen += sizeof ( tWordID );
m_dPackedKeywords.Resize ( iPackedLen );
m_dPackedKeywords[iOff] = ( BYTE ) ( iLen & 0xFF );
memcpy ( m_dPackedKeywords.Begin () + iOff + 1, pWord, iLen );
if ( m_bStoreID )
memcpy ( m_dPackedKeywords.Begin () + iOff + 1 + iLen, &tWordID, sizeof(tWordID) );

m_hKeywords.Add ( iOff, m_sWord );

return iOff;
}
};

ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase )
ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase, bool bStoreID )
{
return new CRtDictKeywords ( pBase );
return new CRtDictKeywords ( pBase, bStoreID );
}
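The packed-keywords layout implied by AddKeyword() above is one length byte, iLen bytes of keyword text, and, only when m_bStoreID is set, the raw SphWordID_t appended right after the word. A reader-side sketch for illustration; this unpacking code is an assumption, not part of the commit:

// unpack one entry at offset iOff, mirroring the writer above
const BYTE * pEntry = m_dPackedKeywords.Begin() + iOff;
int iWordLen = pEntry[0];                    // length byte
const BYTE * pWordText = pEntry + 1;         // iWordLen bytes of keyword text, not zero-terminated
SphWordID_t tStoredID = 0;
if ( m_bStoreID )                            // the id is present only in store-ID mode
	memcpy ( &tStoredID, pEntry + 1 + iWordLen, sizeof(tStoredID) );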


5 changes: 3 additions & 2 deletions src/sphinxint.h
@@ -42,7 +42,7 @@ inline const char * strerrorm ( int errnum )
//////////////////////////////////////////////////////////////////////////

const DWORD INDEX_MAGIC_HEADER = 0x58485053; ///< my magic 'SPHX' header
const DWORD INDEX_FORMAT_VERSION = 59; ///< my format version
const DWORD INDEX_FORMAT_VERSION = 60; ///< my format version

const char MAGIC_SYNONYM_WHITESPACE = 1; // used internally in tokenizer only
const char MAGIC_CODE_SENTENCE = 2; // emitted from tokenizer on sentence boundary
@@ -1673,6 +1673,7 @@ void ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion
void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings );
void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion );
bool AddFieldLens ( CSphSchema & tSchema, bool bDynamic, CSphString & sError );
bool LoadHitlessWords ( const CSphString & sHitlessFiles, ISphTokenizer * pTok, CSphDict * pDict, CSphVector<SphWordID_t> & dHitlessWords, CSphString & sError );

/// Get current thread local index - internal do not use
class RtIndex_i;
@@ -1888,7 +1889,7 @@ class ISphRtDictWraper : public CSphDict
virtual void ResetWarning() = 0;
};

ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase );
ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase, bool bStoreID );

struct SphExpanded_t
{
23 changes: 15 additions & 8 deletions src/sphinxpq.cpp
@@ -165,6 +165,7 @@ class PercolateIndex_c : public PercolateIndex_i

CSphFixedVector<StoredQueryDesc_t> m_dLoadedQueries { 0 }; // temporary, just descriptions
CSphSchema m_tMatchSchema;
CSphVector<SphWordID_t> m_dHitlessWords;

void DoMatchDocuments ( const RtSegment_t * pSeg, PercolateMatchResult_t & tRes );
bool MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters,
@@ -191,6 +192,8 @@
void AddToStoredUnl ( StoredQuerySharedPtr_t tNew ) REQUIRES ( m_tLockHash, m_tLock );
void PostSetupUnl () REQUIRES ( m_tLockHash, m_tLock );
SharedPQSlice_t GetStored () const EXCLUDES ( m_tLock );

bool NeedStoreWordID () const override { return ( m_tSettings.m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.GetLength() ); }
};

//////////////////////////////////////////////////////////////////////////
Expand All @@ -207,7 +210,7 @@ PercolateIndex_i * CreateIndexPercolate ( const CSphSchema & tSchema, const char
return new PercolateIndex_c ( tSchema, sIndexName, sPath );
}

static SegmentReject_t SegmentGetRejects ( const RtSegment_t * pSeg, bool bBuildInfix, bool bUtf8 )
static SegmentReject_t SegmentGetRejects ( const RtSegment_t * pSeg, bool bBuildInfix, bool bUtf8, ESphHitless eHitless )
{
SegmentReject_t tReject;
tReject.m_iRows = pSeg->m_uRows;
@@ -227,7 +230,7 @@
tReject.m_dWilds.Fill ( 0 );
}

RtWordReader_t tDict ( pSeg, true, PERCOLATE_WORDS_PER_CP );
RtWordReader_t tDict ( pSeg, true, PERCOLATE_WORDS_PER_CP, eHitless );
const RtWord_t * pWord = nullptr;
BloomGenTraits_t tBloom0 ( tReject.m_dWilds.Begin() );
BloomGenTraits_t tBloom1 ( tReject.m_dWilds.Begin() + PERCOLATE_BLOOM_WILD_COUNT );
@@ -947,7 +950,7 @@ bool PercolateQwordSetup_c::QwordSetup ( ISphQword * pQword ) const
CSphVector<Slice_t> dDictWords;
ARRAY_FOREACH ( i, dDictLoc )
{
RtWordReader_t tReader ( m_pSeg, true, PERCOLATE_WORDS_PER_CP );
RtWordReader_t tReader ( m_pSeg, true, PERCOLATE_WORDS_PER_CP, m_eHitless );
// locator
// m_uOff - Start
// m_uLen - End
@@ -1034,10 +1037,10 @@ SphWordID_t DictMap_t::GetTerm ( BYTE * sWord ) const
return pTerm->m_uWordID;
}

PercolateMatchContext_t * PercolateIndex_c::CreateMatchContext ( const RtSegment_t * pSeg, const SegmentReject_t &tReject )
PercolateMatchContext_t * PercolateIndex_c::CreateMatchContext ( const RtSegment_t * pSeg, const SegmentReject_t & tReject )
{
return new PercolateMatchContext_t ( pSeg, m_iMaxCodepointLength, m_pDict->HasMorphology(), GetStatelessDict ( m_pDict ), this
, m_tSchema, tReject );
, m_tSchema, tReject, m_tSettings.m_eHitless );
}

// percolate matching
@@ -1382,7 +1385,7 @@ void PercolateIndex_c::DoMatchDocuments ( const RtSegment_t * pSeg, PercolateMat
{
// reject need bloom filter for either infix or prefix
auto tReject = SegmentGetRejects (
pSeg, ( m_tSettings.m_iMinInfixLen>0 || m_tSettings.GetMinPrefixLen ( m_pDict->GetSettings().m_bWordDict )>0 ), m_iMaxCodepointLength>1 );
pSeg, ( m_tSettings.m_iMinInfixLen>0 || m_tSettings.GetMinPrefixLen ( m_pDict->GetSettings().m_bWordDict )>0 ), m_iMaxCodepointLength>1, m_tSettings.m_eHitless );

CSphAtomic iCurQuery;
CSphFixedVector<PercolateMatchContext_t *> dResults ( 1 );
@@ -1462,11 +1465,11 @@ bool PercolateIndex_c::MatchDocuments ( RtAccum_t * pAccExt, PercolateMatchResul

pAcc->Sort();

RtSegment_t * pSeg = pAcc->CreateSegment ( m_tSchema.GetRowSize(), PERCOLATE_WORDS_PER_CP );
RtSegment_t * pSeg = pAcc->CreateSegment ( m_tSchema.GetRowSize(), PERCOLATE_WORDS_PER_CP, m_tSettings.m_eHitless, m_dHitlessWords );
assert ( !pSeg || pSeg->m_uRows>0 );
assert ( !pSeg || pSeg->m_tAliveRows>0 );
BuildSegmentInfixes ( pSeg, m_pDict->HasMorphology(), true, m_tSettings.m_iMinInfixLen,
PERCOLATE_WORDS_PER_CP, ( m_iMaxCodepointLength>1 ) );
PERCOLATE_WORDS_PER_CP, ( m_iMaxCodepointLength>1 ), m_tSettings.m_eHitless );

DoMatchDocuments ( pSeg, tRes );
SafeRelease ( pSeg );
@@ -2059,6 +2062,10 @@ void PercolateIndex_c::PostSetupUnl()
if ( m_tSettings.m_bIndexExactWords )
SetupExactDict ( pDict, pTokenizer );

// hitless
if ( !LoadHitlessWords ( m_tSettings.m_sHitlessFiles, m_pTokenizerIndexing, m_pDict, m_dHitlessWords, m_sLastError ) )
sphWarning ( "index '%s': %s", m_sIndexName.cstr(), m_sLastError.cstr() );

m_pQueries->ReserveGap( m_dLoadedQueries.GetLength () );

CSphString sError;
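Read together, the sphinxpq.cpp hunks wire hitless support into the percolate path: PostSetupUnl() loads the word list into m_dHitlessWords, NeedStoreWordID() reports true for hitless_words=some, and the hitless mode plus word list flow into segment creation and the RT word readers. A sketch of the assumed dictionary hookup (the actual call site is not shown in this diff, and pDict stands for the index dictionary):

// assumed wiring: the RT keywords dict wrapper runs in store-ID mode when word
// IDs are needed to match packed keywords against m_dHitlessWords
bool bStoreID = NeedStoreWordID();
ISphRtDictWraper * pDictRt = sphCreateRtKeywordsDictionaryWrapper ( pDict, bStoreID );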
