Skip to content

Commit

Permalink
Fixed COUNT(DISTINCT) when the distinct attribute is a JSON field
Browse files Browse the repository at this point in the history
  • Loading branch information
glookka committed Jul 18, 2020
1 parent 17b30f7 commit 9bc5c01
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 88 deletions.
198 changes: 110 additions & 88 deletions src/sphinxsort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2296,8 +2296,7 @@ static void AddGroupedMVA ( ADDER && fnAdd, const ByteBlob_t& dRawMVA )
}

template <typename ADDER>
static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDistinctLoc,
ESphAttr eDistinctAttr, const BYTE * pBlobPool, ADDER&& fnAdd )
static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDistinctLoc, ESphAttr eDistinctAttr, const BYTE * pBlobPool, ADDER && fnAdd )
{
switch ( eDistinctAttr )
{
Expand All @@ -2310,6 +2309,17 @@ static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDisti
}
break;

case SPH_ATTR_JSON_FIELD:
PushJsonField ( tEntry.GetAttr(tDistinctLoc), pBlobPool, [fnAdd]( SphAttr_t * pAttr, SphGroupKey_t uGroupKey )
{
if ( uGroupKey )
fnAdd(uGroupKey);

return true;
}
);
break;

case SPH_ATTR_UINT32SET:
case SPH_ATTR_UINT32SET_PTR:
AddGroupedMVA<DWORD> ( fnAdd, tEntry.FetchAttrData ( tDistinctLoc, pBlobPool ) );
Expand Down Expand Up @@ -3893,6 +3903,98 @@ struct MvaNGroupSorter_c : public MVAGroupSorter_T < CSphKBufferNGroupSorter < C
};


/// Decode a packed JSON field value and emit one (attribute, group-key) pair per
/// groupable item to fnPush.
///
/// \param iValue     packed JSON value: type + offset into the blob pool
///                   (unpacked via sphJsonUnpackType / sphJsonUnpackOffset)
/// \param pBlobPool  base pointer of the JSON blob pool iValue's offset is relative to
/// \param fnPush     callback ( SphAttr_t * pAttr, SphGroupKey_t uGroupKey ) -> bool;
///                   pAttr is nullptr and uGroupKey is 0 for empty roots/containers
/// \return true if at least one fnPush call returned true
template <typename PUSH>
bool PushJsonField ( int64_t iValue, const BYTE * pBlobPool, PUSH && fnPush )
{
	int iLen;
	char szBuf[32];
	SphGroupKey_t uGroupKey;

	ESphJsonType eJson = sphJsonUnpackType ( iValue );
	const BYTE * pValue = pBlobPool + sphJsonUnpackOffset ( iValue );

	switch ( eJson )
	{
	case JSON_ROOT:
		{
			iLen = sphJsonNodeSize ( JSON_ROOT, pValue );
			bool bEmpty = iLen==5; // mask and JSON_EOF
			// empty roots all share group key 0 and carry no attribute value
			uGroupKey = bEmpty ? 0 : sphFNV64 ( pValue, iLen );
			return fnPush ( bEmpty ? nullptr : &iValue, uGroupKey );
		}

	case JSON_STRING:
	case JSON_OBJECT:
	case JSON_MIXED_VECTOR:
		iLen = sphJsonUnpackInt ( &pValue );
		// length 1 means an empty object/vector; a 1-char string is still a real value
		uGroupKey = ( iLen==1 && eJson!=JSON_STRING ) ? 0 : sphFNV64 ( pValue, iLen );
		return fnPush ( ( iLen==1 && eJson!=JSON_STRING ) ? nullptr : &iValue, uGroupKey );

	case JSON_STRING_VECTOR:
		{
			bool bRes = false;
			sphJsonUnpackInt ( &pValue ); // skip the total node size
			iLen = sphJsonUnpackInt ( &pValue );
			for ( int i=0; i<iLen; i++ )
			{
				// repack the current (length-prefixed) element as a standalone JSON_STRING attr
				int64_t iNewValue = sphJsonPackTypeOffset ( JSON_STRING, pValue-pBlobPool );

				int iStrLen = sphJsonUnpackInt ( &pValue );
				uGroupKey = sphFNV64 ( pValue, iStrLen );
				bRes |= fnPush ( &iNewValue, uGroupKey );
				pValue += iStrLen;
			}
			return bRes;
		}

	case JSON_INT32:
		return fnPush ( &iValue, sphFNV64 ( (BYTE*)FormatInt ( szBuf, (int)sphGetDword(pValue) ) ) );

	case JSON_INT64:
		// FIXED: value was cast to (int) before formatting, which truncated 64-bit
		// values and made distinct int64 keys differing only in high bits collide
		return fnPush ( &iValue, sphFNV64 ( (BYTE*)FormatInt ( szBuf, sphJsonLoadBigint ( &pValue ) ) ) );

	case JSON_DOUBLE:
		snprintf ( szBuf, sizeof(szBuf), "%f", sphQW2D ( sphJsonLoadBigint ( &pValue ) ) );
		return fnPush ( &iValue, sphFNV64 ( (const BYTE*)szBuf ) );

	case JSON_INT32_VECTOR:
		{
			bool bRes = false;
			iLen = sphJsonUnpackInt ( &pValue );
			auto p = (int*)pValue;
			for ( int i=0; i<iLen; i++ )
			{
				// each element is pushed as its own packed JSON_INT32; raw value is the key
				int64_t iPacked = sphJsonPackTypeOffset ( JSON_INT32, (BYTE*)p-pBlobPool );
				uGroupKey = *p++;
				bRes |= fnPush ( &iPacked, uGroupKey );
			}
			return bRes;
		}

	case JSON_INT64_VECTOR:
	case JSON_DOUBLE_VECTOR:
		{
			bool bRes = false;
			iLen = sphJsonUnpackInt ( &pValue );
			auto p = (int64_t*)pValue;
			ESphJsonType eType = eJson==JSON_INT64_VECTOR ? JSON_INT64 : JSON_DOUBLE;
			for ( int i=0; i<iLen; i++ )
			{
				// NOTE: doubles use their raw 64-bit representation as the group key here,
				// unlike scalar JSON_DOUBLE which hashes the "%f" text form
				int64_t iPacked = sphJsonPackTypeOffset ( eType, (BYTE*)p-pBlobPool );
				uGroupKey = *p++;
				bRes |= fnPush ( &iPacked, uGroupKey );
			}
			return bRes;
		}

	default:
		// unknown/unsupported node type: push a zero value under the zero group key
		uGroupKey = 0;
		iValue = 0;
		return fnPush ( &iValue, uGroupKey );
	}
}


/// match sorter with k-buffering and group-by for JSON arrays
template < typename COMPGROUP, bool DISTINCT, bool NOTIFICATIONS >
class CSphKBufferJsonGroupSorter : public CSphKBufferGroupSorter < COMPGROUP, DISTINCT, NOTIFICATIONS >
Expand All @@ -3914,97 +4016,17 @@ class CSphKBufferJsonGroupSorter : public CSphKBufferGroupSorter < COMPGROUP, DI
/// add entry to the queue
bool Push ( const CSphMatch & tMatch ) override
{
bool bRes = false;
SphGroupKey_t uGroupKey = this->m_pGrouper->KeyFromMatch ( tMatch );

int iLen;
char sBuf[32];

SphGroupKey_t uGroupkey = this->m_pGrouper->KeyFromMatch ( tMatch );

auto iValue = (int64_t)uGroupkey;
CSphGrouper* pGrouper = this->m_pGrouper;
auto iValue = (int64_t)uGroupKey;
CSphGrouper * pGrouper = this->m_pGrouper;
const BYTE * pBlobPool = ((CSphGrouperJsonField*)pGrouper)->GetBlobPool();

ESphJsonType eJson = sphJsonUnpackType ( iValue );
const BYTE * pValue = pBlobPool + sphJsonUnpackOffset ( iValue );

switch ( eJson )
{
case JSON_ROOT:
{
iLen = sphJsonNodeSize ( JSON_ROOT, pValue );
bool bEmpty = iLen==5; // mask and JSON_EOF
uGroupkey = bEmpty ? 0 : sphFNV64 ( pValue, iLen );
return this->PushEx ( tMatch, uGroupkey, false, false, bEmpty ? nullptr : &iValue );
}
case JSON_STRING:
case JSON_OBJECT:
case JSON_MIXED_VECTOR:
iLen = sphJsonUnpackInt ( &pValue );
uGroupkey = ( iLen==1 && eJson!=JSON_STRING ) ? 0 : sphFNV64 ( pValue, iLen );
return this->PushEx ( tMatch, uGroupkey, false, false, ( iLen==1 && eJson!=JSON_STRING ) ? 0: &iValue );

case JSON_STRING_VECTOR:
{
sphJsonUnpackInt ( &pValue );
iLen = sphJsonUnpackInt ( &pValue );
for ( int i=0;i<iLen;i++ )
{
int64_t iNewValue = sphJsonPackTypeOffset ( JSON_STRING, pValue-pBlobPool );

int iStrLen = sphJsonUnpackInt ( &pValue );
uGroupkey = sphFNV64 ( pValue, iStrLen );
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iNewValue );
pValue += iStrLen;
}
return bRes;
}
case JSON_INT32:
uGroupkey = sphFNV64 ( (BYTE*)FormatInt ( sBuf, (int)sphGetDword(pValue) ) );
break;
case JSON_INT64:
uGroupkey = sphFNV64 ( (BYTE*)FormatInt ( sBuf, (int)sphJsonLoadBigint ( &pValue ) ) );
break;
case JSON_DOUBLE:
snprintf ( sBuf, sizeof(sBuf), "%f", sphQW2D ( sphJsonLoadBigint ( &pValue ) ) );
uGroupkey = sphFNV64 ( (const BYTE*)sBuf );
break;
case JSON_INT32_VECTOR:
{
iLen = sphJsonUnpackInt ( &pValue );
auto p = (int*)pValue;
for ( int i=0;i<iLen;i++ )
{
int64_t iPacked = sphJsonPackTypeOffset ( JSON_INT32, (BYTE*)p-pBlobPool );
uGroupkey = *p++;
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iPacked );
}
return bRes;
}
break;
case JSON_INT64_VECTOR:
case JSON_DOUBLE_VECTOR:
return PushJsonField ( iValue, pBlobPool, [this, &tMatch]( SphAttr_t * pAttr, SphGroupKey_t uGroupKey )
{
iLen = sphJsonUnpackInt ( &pValue );
auto p = (int64_t*)pValue;
ESphJsonType eType = eJson==JSON_INT64_VECTOR ? JSON_INT64 : JSON_DOUBLE;
for ( int i=0;i<iLen;i++ )
{
int64_t iPacked = sphJsonPackTypeOffset ( eType, (BYTE*)p-pBlobPool );
uGroupkey = *p++;
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iPacked );
}
return bRes;
return this->PushEx ( tMatch, uGroupKey, false, false, pAttr );
}
break;
default:
uGroupkey = 0;
iValue = 0;
break;
}

bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iValue );
return bRes;
);
}

/// add pre-grouped entry to the queue
Expand Down
1 change: 1 addition & 0 deletions test/test_412/model.bin
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a:1:{i:0;a:7:{i:0;a:2:{s:8:"sphinxql";s:30:"create table t(f text, j json)";s:14:"total_affected";i:0;}i:1;a:2:{s:8:"sphinxql";s:40:"insert into t(id,j) values(1,'{"id":6}')";s:14:"total_affected";i:1;}i:2;a:2:{s:8:"sphinxql";s:40:"insert into t(id,j) values(2,'{"id":7}')";s:14:"total_affected";i:1;}i:3;a:3:{s:8:"sphinxql";s:39:"select j.id i, count(distinct i) from t";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:1:"i";s:1:"7";s:17:"count(distinct i)";s:1:"2";}}}i:4;a:2:{s:8:"sphinxql";s:16:"flush ramchunk t";s:14:"total_affected";i:0;}i:5;a:3:{s:8:"sphinxql";s:39:"select j.id i, count(distinct i) from t";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:1:"i";s:1:"7";s:17:"count(distinct i)";s:1:"2";}}}i:6;a:2:{s:8:"sphinxql";s:12:"drop table t";s:14:"total_affected";i:0;}}}
34 changes: 34 additions & 0 deletions test/test_412/test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Regression test: COUNT(DISTINCT) on an alias of a JSON field (test_412). -->
<test>

<name>json vs count distinct</name>
<!-- All work is driven through SphinxQL below; no offline indexer run is needed. -->
<skip_indexer/>

<config>
indexer
{
mem_limit = 16M
}

searchd
{
<searchd_Settings/>
data_dir = <data_path path="data0"/>
}
</config>

<sphqueries>

<sphinxql>create table t(f text, j json)</sphinxql>
<sphinxql>insert into t(id,j) values(1,'{"id":6}')</sphinxql>
<sphinxql>insert into t(id,j) values(2,'{"id":7}')</sphinxql>
<!-- two rows with distinct j.id values; per model.bin, count(distinct i) is expected to be 2 -->
<sphinxql>select j.id i, count(distinct i) from t</sphinxql>

<!-- re-run the same query after flushing the RAM chunk, to check the flushed-chunk path gives the same result -->
<sphinxql>flush ramchunk t</sphinxql>
<sphinxql>select j.id i, count(distinct i) from t</sphinxql>

<sphinxql>drop table t</sphinxql>

</sphqueries>

</test>

0 comments on commit 9bc5c01

Please sign in to comment.