Skip to content

Commit

Permalink
Fixed COUNT(DISTINCT) when the distinct attribute is a JSON field
Browse files Browse the repository at this point in the history
  • Loading branch information
glookka committed Jul 18, 2020
1 parent 17b30f7 commit 9bc5c01
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 88 deletions.
198 changes: 110 additions & 88 deletions src/sphinxsort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2296,8 +2296,7 @@ static void AddGroupedMVA ( ADDER && fnAdd, const ByteBlob_t& dRawMVA )
}

template <typename ADDER>
static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDistinctLoc,
ESphAttr eDistinctAttr, const BYTE * pBlobPool, ADDER&& fnAdd )
static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDistinctLoc, ESphAttr eDistinctAttr, const BYTE * pBlobPool, ADDER && fnAdd )
{
switch ( eDistinctAttr )
{
Expand All @@ -2310,6 +2309,17 @@ static void AddDistinctKeys ( const CSphMatch & tEntry, CSphAttrLocator & tDisti
}
break;

case SPH_ATTR_JSON_FIELD:
PushJsonField ( tEntry.GetAttr(tDistinctLoc), pBlobPool, [fnAdd]( SphAttr_t * pAttr, SphGroupKey_t uGroupKey )
{
if ( uGroupKey )
fnAdd(uGroupKey);

return true;
}
);
break;

case SPH_ATTR_UINT32SET:
case SPH_ATTR_UINT32SET_PTR:
AddGroupedMVA<DWORD> ( fnAdd, tEntry.FetchAttrData ( tDistinctLoc, pBlobPool ) );
Expand Down Expand Up @@ -3893,6 +3903,98 @@ struct MvaNGroupSorter_c : public MVAGroupSorter_T < CSphKBufferNGroupSorter < C
};


/// Decode a packed JSON field value and emit one (attribute, group-key) pair per
/// groupable item to fnPush.
///
/// \param iValue     packed JSON value: type + offset into the blob pool
///                   (unpacked via sphJsonUnpackType / sphJsonUnpackOffset)
/// \param pBlobPool  base pointer of the JSON blob pool iValue's offset is relative to
/// \param fnPush     callback ( SphAttr_t * pAttr, SphGroupKey_t uGroupKey ) -> bool;
///                   pAttr is nullptr and uGroupKey is 0 for empty roots/containers
/// \return true if at least one fnPush call returned true
template <typename PUSH>
bool PushJsonField ( int64_t iValue, const BYTE * pBlobPool, PUSH && fnPush )
{
	int iLen;
	char szBuf[32];
	SphGroupKey_t uGroupKey;

	ESphJsonType eJson = sphJsonUnpackType ( iValue );
	const BYTE * pValue = pBlobPool + sphJsonUnpackOffset ( iValue );

	switch ( eJson )
	{
	case JSON_ROOT:
		{
			iLen = sphJsonNodeSize ( JSON_ROOT, pValue );
			bool bEmpty = iLen==5; // mask and JSON_EOF
			// empty roots all share group key 0 and carry no attribute value
			uGroupKey = bEmpty ? 0 : sphFNV64 ( pValue, iLen );
			return fnPush ( bEmpty ? nullptr : &iValue, uGroupKey );
		}

	case JSON_STRING:
	case JSON_OBJECT:
	case JSON_MIXED_VECTOR:
		iLen = sphJsonUnpackInt ( &pValue );
		// length 1 means an empty object/vector; a 1-char string is still a real value
		uGroupKey = ( iLen==1 && eJson!=JSON_STRING ) ? 0 : sphFNV64 ( pValue, iLen );
		return fnPush ( ( iLen==1 && eJson!=JSON_STRING ) ? nullptr : &iValue, uGroupKey );

	case JSON_STRING_VECTOR:
		{
			bool bRes = false;
			sphJsonUnpackInt ( &pValue ); // skip the total node size
			iLen = sphJsonUnpackInt ( &pValue );
			for ( int i=0; i<iLen; i++ )
			{
				// repack the current (length-prefixed) element as a standalone JSON_STRING attr
				int64_t iNewValue = sphJsonPackTypeOffset ( JSON_STRING, pValue-pBlobPool );

				int iStrLen = sphJsonUnpackInt ( &pValue );
				uGroupKey = sphFNV64 ( pValue, iStrLen );
				bRes |= fnPush ( &iNewValue, uGroupKey );
				pValue += iStrLen;
			}
			return bRes;
		}

	case JSON_INT32:
		return fnPush ( &iValue, sphFNV64 ( (BYTE*)FormatInt ( szBuf, (int)sphGetDword(pValue) ) ) );

	case JSON_INT64:
		// FIXED: value was cast to (int) before formatting, which truncated 64-bit
		// values and made distinct int64 keys differing only in high bits collide
		return fnPush ( &iValue, sphFNV64 ( (BYTE*)FormatInt ( szBuf, sphJsonLoadBigint ( &pValue ) ) ) );

	case JSON_DOUBLE:
		snprintf ( szBuf, sizeof(szBuf), "%f", sphQW2D ( sphJsonLoadBigint ( &pValue ) ) );
		return fnPush ( &iValue, sphFNV64 ( (const BYTE*)szBuf ) );

	case JSON_INT32_VECTOR:
		{
			bool bRes = false;
			iLen = sphJsonUnpackInt ( &pValue );
			auto p = (int*)pValue;
			for ( int i=0; i<iLen; i++ )
			{
				// each element is pushed as its own packed JSON_INT32; raw value is the key
				int64_t iPacked = sphJsonPackTypeOffset ( JSON_INT32, (BYTE*)p-pBlobPool );
				uGroupKey = *p++;
				bRes |= fnPush ( &iPacked, uGroupKey );
			}
			return bRes;
		}

	case JSON_INT64_VECTOR:
	case JSON_DOUBLE_VECTOR:
		{
			bool bRes = false;
			iLen = sphJsonUnpackInt ( &pValue );
			auto p = (int64_t*)pValue;
			ESphJsonType eType = eJson==JSON_INT64_VECTOR ? JSON_INT64 : JSON_DOUBLE;
			for ( int i=0; i<iLen; i++ )
			{
				// NOTE: doubles use their raw 64-bit representation as the group key here,
				// unlike scalar JSON_DOUBLE which hashes the "%f" text form
				int64_t iPacked = sphJsonPackTypeOffset ( eType, (BYTE*)p-pBlobPool );
				uGroupKey = *p++;
				bRes |= fnPush ( &iPacked, uGroupKey );
			}
			return bRes;
		}

	default:
		// unknown/unsupported node type: push a zero value under the zero group key
		uGroupKey = 0;
		iValue = 0;
		return fnPush ( &iValue, uGroupKey );
	}
}


/// match sorter with k-buffering and group-by for JSON arrays
template < typename COMPGROUP, bool DISTINCT, bool NOTIFICATIONS >
class CSphKBufferJsonGroupSorter : public CSphKBufferGroupSorter < COMPGROUP, DISTINCT, NOTIFICATIONS >
Expand All @@ -3914,97 +4016,17 @@ class CSphKBufferJsonGroupSorter : public CSphKBufferGroupSorter < COMPGROUP, DI
/// add entry to the queue
bool Push ( const CSphMatch & tMatch ) override
{
bool bRes = false;
SphGroupKey_t uGroupKey = this->m_pGrouper->KeyFromMatch ( tMatch );

int iLen;
char sBuf[32];

SphGroupKey_t uGroupkey = this->m_pGrouper->KeyFromMatch ( tMatch );

auto iValue = (int64_t)uGroupkey;
CSphGrouper* pGrouper = this->m_pGrouper;
auto iValue = (int64_t)uGroupKey;
CSphGrouper * pGrouper = this->m_pGrouper;
const BYTE * pBlobPool = ((CSphGrouperJsonField*)pGrouper)->GetBlobPool();

ESphJsonType eJson = sphJsonUnpackType ( iValue );
const BYTE * pValue = pBlobPool + sphJsonUnpackOffset ( iValue );

switch ( eJson )
{
case JSON_ROOT:
{
iLen = sphJsonNodeSize ( JSON_ROOT, pValue );
bool bEmpty = iLen==5; // mask and JSON_EOF
uGroupkey = bEmpty ? 0 : sphFNV64 ( pValue, iLen );
return this->PushEx ( tMatch, uGroupkey, false, false, bEmpty ? nullptr : &iValue );
}
case JSON_STRING:
case JSON_OBJECT:
case JSON_MIXED_VECTOR:
iLen = sphJsonUnpackInt ( &pValue );
uGroupkey = ( iLen==1 && eJson!=JSON_STRING ) ? 0 : sphFNV64 ( pValue, iLen );
return this->PushEx ( tMatch, uGroupkey, false, false, ( iLen==1 && eJson!=JSON_STRING ) ? 0: &iValue );

case JSON_STRING_VECTOR:
{
sphJsonUnpackInt ( &pValue );
iLen = sphJsonUnpackInt ( &pValue );
for ( int i=0;i<iLen;i++ )
{
int64_t iNewValue = sphJsonPackTypeOffset ( JSON_STRING, pValue-pBlobPool );

int iStrLen = sphJsonUnpackInt ( &pValue );
uGroupkey = sphFNV64 ( pValue, iStrLen );
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iNewValue );
pValue += iStrLen;
}
return bRes;
}
case JSON_INT32:
uGroupkey = sphFNV64 ( (BYTE*)FormatInt ( sBuf, (int)sphGetDword(pValue) ) );
break;
case JSON_INT64:
uGroupkey = sphFNV64 ( (BYTE*)FormatInt ( sBuf, (int)sphJsonLoadBigint ( &pValue ) ) );
break;
case JSON_DOUBLE:
snprintf ( sBuf, sizeof(sBuf), "%f", sphQW2D ( sphJsonLoadBigint ( &pValue ) ) );
uGroupkey = sphFNV64 ( (const BYTE*)sBuf );
break;
case JSON_INT32_VECTOR:
{
iLen = sphJsonUnpackInt ( &pValue );
auto p = (int*)pValue;
for ( int i=0;i<iLen;i++ )
{
int64_t iPacked = sphJsonPackTypeOffset ( JSON_INT32, (BYTE*)p-pBlobPool );
uGroupkey = *p++;
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iPacked );
}
return bRes;
}
break;
case JSON_INT64_VECTOR:
case JSON_DOUBLE_VECTOR:
return PushJsonField ( iValue, pBlobPool, [this, &tMatch]( SphAttr_t * pAttr, SphGroupKey_t uGroupKey )
{
iLen = sphJsonUnpackInt ( &pValue );
auto p = (int64_t*)pValue;
ESphJsonType eType = eJson==JSON_INT64_VECTOR ? JSON_INT64 : JSON_DOUBLE;
for ( int i=0;i<iLen;i++ )
{
int64_t iPacked = sphJsonPackTypeOffset ( eType, (BYTE*)p-pBlobPool );
uGroupkey = *p++;
bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iPacked );
}
return bRes;
return this->PushEx ( tMatch, uGroupKey, false, false, pAttr );
}
break;
default:
uGroupkey = 0;
iValue = 0;
break;
}

bRes |= this->PushEx ( tMatch, uGroupkey, false, false, &iValue );
return bRes;
);
}

/// add pre-grouped entry to the queue
Expand Down
1 change: 1 addition & 0 deletions test/test_412/model.bin
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a:1:{i:0;a:7:{i:0;a:2:{s:8:"sphinxql";s:30:"create table t(f text, j json)";s:14:"total_affected";i:0;}i:1;a:2:{s:8:"sphinxql";s:40:"insert into t(id,j) values(1,'{"id":6}')";s:14:"total_affected";i:1;}i:2;a:2:{s:8:"sphinxql";s:40:"insert into t(id,j) values(2,'{"id":7}')";s:14:"total_affected";i:1;}i:3;a:3:{s:8:"sphinxql";s:39:"select j.id i, count(distinct i) from t";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:1:"i";s:1:"7";s:17:"count(distinct i)";s:1:"2";}}}i:4;a:2:{s:8:"sphinxql";s:16:"flush ramchunk t";s:14:"total_affected";i:0;}i:5;a:3:{s:8:"sphinxql";s:39:"select j.id i, count(distinct i) from t";s:10:"total_rows";i:1;s:4:"rows";a:1:{i:0;a:2:{s:1:"i";s:1:"7";s:17:"count(distinct i)";s:1:"2";}}}i:6;a:2:{s:8:"sphinxql";s:12:"drop table t";s:14:"total_affected";i:0;}}}
34 changes: 34 additions & 0 deletions test/test_412/test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Regression test: COUNT(DISTINCT) on an alias of a JSON field (test_412). -->
<test>

<name>json vs count distinct</name>
<!-- All work is driven through SphinxQL below; no offline indexer run is needed. -->
<skip_indexer/>

<config>
indexer
{
mem_limit = 16M
}

searchd
{
<searchd_Settings/>
data_dir = <data_path path="data0"/>
}
</config>

<sphqueries>

<sphinxql>create table t(f text, j json)</sphinxql>
<sphinxql>insert into t(id,j) values(1,'{"id":6}')</sphinxql>
<sphinxql>insert into t(id,j) values(2,'{"id":7}')</sphinxql>
<!-- two rows with distinct j.id values; per model.bin, count(distinct i) is expected to be 2 -->
<sphinxql>select j.id i, count(distinct i) from t</sphinxql>

<!-- re-run the same query after flushing the RAM chunk, to check the flushed-chunk path gives the same result -->
<sphinxql>flush ramchunk t</sphinxql>
<sphinxql>select j.id i, count(distinct i) from t</sphinxql>

<sphinxql>drop table t</sphinxql>

</sphqueries>

</test>

0 comments on commit 9bc5c01

Please sign in to comment.