Skip to content

Commit

Permalink
Various size increases
Browse files Browse the repository at this point in the history
- Increase range size to 256 bits
- Increase DP size to 256 bits
- Network.cpp transmit 256-bit kangaroos
- Backup.cpp export/import 256-bit kangaroos
- XOR all uint64_t members of kangaroo to make hashtable index
  • Loading branch information
ZenulAbidin committed Mar 19, 2021
1 parent 354bb80 commit 7401482
Show file tree
Hide file tree
Showing 17 changed files with 271 additions and 191 deletions.
21 changes: 9 additions & 12 deletions Backup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,Int *x,Int *y,Int *d) {

}

void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,Int* y,Int* d) {
void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int256_t>& kangs,Int* x,Int* y,Int* d) {

uint64_t n = 0;

Expand All @@ -248,8 +248,7 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,In
for(n = 0; n < avail; n++) {

Int dist;
uint32_t type;
HashTable::CalcDistAndType(kangs[n],&dist,&type);
HashTable::CalcDist(&kangs[n],&dist);
dists.push_back(dist);

}
Expand Down Expand Up @@ -277,7 +276,6 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,In
}

kangs.erase(kangs.begin(),kangs.begin() + avail);

}

if(avail < nbWalk) {
Expand All @@ -293,7 +291,7 @@ void Kangaroo::FectchKangaroos(TH_PARAM *threads) {
double sFetch = Timer::get_tick();

// From server
vector<int128_t> kangs;
vector<int256_t> kangs;
if(saveKangarooByServer) {
::printf("FectchKangaroosFromServer");
if(!GetKangaroosFromServer(workFile,kangs))
Expand Down Expand Up @@ -492,22 +490,21 @@ void Kangaroo::SaveWork(uint64_t totalCount,double totalTime,TH_PARAM *threads,i
if(saveKangarooByServer) {

::printf("\nSaveWork (Kangaroo->Server): %s",fileName.c_str());
vector<int128_t> kangs;
vector<int256_t> kangs;
for(int i = 0; i < nbThread; i++)
totalWalk += threads[i].nbKangaroo;
kangs.reserve(totalWalk);

for(int i = 0; i < nbThread; i++) {
int128_t X;
int128_t D;
uint64_t h;
int256_t X;
int256_t D;
for(uint64_t n = 0; n < threads[i].nbKangaroo; n++) {
HashTable::Convert(&threads[i].px[n],&threads[i].distance[n],n%2,&h,&X,&D);
kangs.push_back(D);
HashTable::Convert(&threads[i].px[n],&threads[i].distance[n],&X,&D);
kangs.push_back(D);
}
}
SendKangaroosToServer(fileName,kangs);
size = kangs.size()*16 + 16;
size = kangs.size()*32 + 32;
goto end;

} else {
Expand Down
11 changes: 5 additions & 6 deletions Check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
for(uint32_t i = 0; i < nbItem; i++) {
e = hT->E[h].items[i];
Int dist;
uint32_t kType;
HashTable::CalcDistAndType(e->d,&dist,&kType);
uint32_t kType = e->kType;
HashTable::CalcDist(&(e->d),&dist);
dists.push_back(dist);
types.push_back(kType);
}
Expand All @@ -60,8 +60,8 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
::fread(items+i,32,1,f);
e = items + i;
Int dist;
uint32_t kType;
HashTable::CalcDistAndType(e->d,&dist,&kType);
uint32_t kType = e->kType;
HashTable::CalcDist(&(e->d),&dist);
dists.push_back(dist);
types.push_back(kType);
}
Expand All @@ -88,8 +88,7 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
if(hT) e = hT->E[h].items[i];
else e = items + i;

uint32_t hC = S[i].x.bits64[2] & HASH_MASK;
ok = (hC == h) && (S[i].x.bits64[0] == e->x.i64[0]) && (S[i].x.bits64[1] == e->x.i64[1]);
ok = (S[i].x.bits64[0] == e->x.i64[0]) && (S[i].x.bits64[1] == e->x.i64[1]) && (S[i].x.bits64[2] == e->x.i64[2]) && (S[i].x.bits64[3] == e->x.i64[3]);
if(!ok) nbWrong++;
//if(!ok) {
// ::printf("\nCheckWorkFile wrong at: %06X [%d]\n",h,i);
Expand Down
Binary file added GPU/.GPUMath.h.swp
Binary file not shown.
2 changes: 1 addition & 1 deletion GPU/GPUCompute.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t

uint64_t px[GPU_GRP_SIZE][4];
uint64_t py[GPU_GRP_SIZE][4];
uint64_t dist[GPU_GRP_SIZE][2];
uint64_t dist[GPU_GRP_SIZE][4];
#ifdef USE_SYMMETRY
uint64_t lastJump[GPU_GRP_SIZE];
#endif
Expand Down
40 changes: 25 additions & 15 deletions GPU/GPUEngine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ int _ConvertSMVer2Cores(int major,int minor) {
{ 0x70, 64 },
{ 0x72, 64 },
{ 0x75, 64 },
{ 0x80, 64 },
{ 0x86, 64 },
{ -1, -1 } };

int index = 0;
Expand Down Expand Up @@ -392,27 +394,30 @@ void GPUEngine::SetKangaroos(Int *px,Int *py,Int *d) {
for(int t = 0; t < nbThreadPerGroup; t++) {

// X
inputKangarooPinned[g * strideSize + t + 0 * nbThreadPerGroup] = px[idx].bits64[0];
inputKangarooPinned[g * strideSize + t + 1 * nbThreadPerGroup] = px[idx].bits64[1];
inputKangarooPinned[g * strideSize + t + 2 * nbThreadPerGroup] = px[idx].bits64[2];
inputKangarooPinned[g * strideSize + t + 3 * nbThreadPerGroup] = px[idx].bits64[3];
Int tpx = px[idx];
inputKangarooPinned[g * strideSize + t + 0 * nbThreadPerGroup] = tpx.bits64[0];
inputKangarooPinned[g * strideSize + t + 1 * nbThreadPerGroup] = tpx.bits64[1];
inputKangarooPinned[g * strideSize + t + 2 * nbThreadPerGroup] = tpx.bits64[2];
inputKangarooPinned[g * strideSize + t + 3 * nbThreadPerGroup] = tpx.bits64[3];

// Y
inputKangarooPinned[g * strideSize + t + 4 * nbThreadPerGroup] = py[idx].bits64[0];
inputKangarooPinned[g * strideSize + t + 5 * nbThreadPerGroup] = py[idx].bits64[1];
inputKangarooPinned[g * strideSize + t + 6 * nbThreadPerGroup] = py[idx].bits64[2];
inputKangarooPinned[g * strideSize + t + 7 * nbThreadPerGroup] = py[idx].bits64[3];
Int tpy = py[idx];
inputKangarooPinned[g * strideSize + t + 4 * nbThreadPerGroup] = tpy.bits64[0];
inputKangarooPinned[g * strideSize + t + 5 * nbThreadPerGroup] = tpy.bits64[1];
inputKangarooPinned[g * strideSize + t + 6 * nbThreadPerGroup] = tpy.bits64[2];
inputKangarooPinned[g * strideSize + t + 7 * nbThreadPerGroup] = tpy.bits64[3];

// Distance
Int dOff;
dOff.Set(&d[idx]);
if(idx % 2 == WILD) dOff.ModAddK1order(&wildOffset);
inputKangarooPinned[g * strideSize + t + 8 * nbThreadPerGroup] = dOff.bits64[0];
inputKangarooPinned[g * strideSize + t + 9 * nbThreadPerGroup] = dOff.bits64[1];

inputKangarooPinned[g * strideSize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
inputKangarooPinned[g * strideSize + t + 11 * nbThreadPerGroup] = dOff.bits64[3];
#ifdef USE_SYMMETRY
// Last jump
inputKangarooPinned[t + 10 * nbThreadPerGroup] = (uint64_t)NB_JUMP;
inputKangarooPinned[t + 12 * nbThreadPerGroup] = (uint64_t)NB_JUMP;
#endif

idx++;
Expand Down Expand Up @@ -490,6 +495,7 @@ void GPUEngine::GetKangaroos(Int *px,Int *py,Int *d) {

}

// I think this is for public keys and initial distances
void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {

int gSize = KSIZE * GPU_GRP_SIZE;
Expand Down Expand Up @@ -528,11 +534,15 @@ void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 8 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[1];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 9 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[2];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 10 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[3];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 11 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);

#ifdef USE_SYMMETRY
// Last jump
inputKangarooPinned[0] = (uint64_t)NB_JUMP;
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 10 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
#endif
}
Expand Down Expand Up @@ -561,8 +571,8 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
this->dpMask = dpMask;
for(int i=0;i< NB_JUMP;i++)
memcpy(jumpPinned + 2*i,distance[i].bits64,16);
cudaMemcpyToSymbol(jD,jumpPinned,jumpSize/2);
memcpy(jumpPinned + 4*i,distance[i].bits64,32);
cudaMemcpyToSymbol(jD,jumpPinned,jumpSize);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
Expand Down Expand Up @@ -666,8 +676,8 @@ bool GPUEngine::Launch(std::vector<ITEM> &hashFound,bool spinWait) {
uint64_t *d = (uint64_t *)(itemPtr + 8);
it.d.bits64[0] = d[0];
it.d.bits64[1] = d[1];
it.d.bits64[2] = 0;
it.d.bits64[3] = 0;
it.d.bits64[2] = d[2];
it.d.bits64[3] = d[3];
it.d.bits64[4] = 0;
if(it.kIdx % 2 == WILD) it.d.ModSubK1order(&wildOffset);
Expand Down
7 changes: 4 additions & 3 deletions GPU/GPUEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
typedef struct {
Int x;
Int d;
uint64_t kIdx;
uint64_t kIdx; // Appears like this is used as kType
uint64_t h;
} ITEM;

class GPUEngine {
Expand All @@ -44,9 +45,9 @@ class GPUEngine {
GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t maxFound);
~GPUEngine();
void SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py);
void SetKangaroos(Int *px,Int *py,Int *d);
void SetKangaroos(uint64_t kIdx, Int *px,Int *py,Int *d);
void GetKangaroos(Int *px,Int *py,Int *d);
void SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d);
void SetKangaroo(Int *px,Int *py,Int *d);
bool Launch(std::vector<ITEM> &hashFound,bool spinWait = false);
void SetWildOffset(Int *offset);
int GetNbThread();
Expand Down
16 changes: 8 additions & 8 deletions GPU/GPUGenerate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ using namespace std;
void GPUEngine::GenerateCode(Secp256K1 *secp) {

// Compute generator table
Int jumpDistance[129];
Point jumpPoint[129];
Int jumpDistance[257];
Point jumpPoint[257];

// Kangaroo jumps
jumpPoint[0] = secp->G;
jumpDistance[0].SetInt32(1);
for(int i = 1; i < 129; ++i) {
for(int i = 1; i < 257; ++i) {
jumpDistance[i].Add(&jumpDistance[i - 1],&jumpDistance[i - 1]);
jumpPoint[i] = secp->DoubleDirect(jumpPoint[i - 1]);
}
Expand All @@ -40,22 +40,22 @@ void GPUEngine::GenerateCode(Secp256K1 *secp) {

fprintf(f,"// File generated by GPUEngine::GenerateCode()\n");

fprintf(f,"// Jump distance table (Contains 1,2,4,...,2^129\n");
fprintf(f,"// Jump distance table (Contains 1,2,4,...,2^256)\n");
fprintf(f,"__device__ __constant__ uint64_t jD[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpDistance[i].GetC64Str(4).c_str());
}
fprintf(f,"};\n");

fprintf(f,"// Jump point table (Contains G,2G,4G,...,2^129.G)\n");
fprintf(f,"// Jump point table (Contains G,2G,4G,...,2^256.G)\n");
fprintf(f,"__device__ __constant__ uint64_t jPx[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpPoint[i].x.GetC64Str(4).c_str());
}
fprintf(f,"};\n");

fprintf(f,"__device__ __constant__ uint64_t jPy[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpPoint[i].y.GetC64Str(4).c_str());
}
fprintf(f,"};\n\n");
Expand Down
42 changes: 31 additions & 11 deletions GPU/GPUMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#define MADDS(r,a,b,c) asm volatile ("madc.hi.s64 %0, %1, %2, %3;" : "=l"(r) : "l"(a), "l"(b), "l"(c));

// Jump distance
__device__ __constant__ uint64_t jD[NB_JUMP][2];
__device__ __constant__ uint64_t jD[NB_JUMP][4];
// jump points
__device__ __constant__ uint64_t jPx[NB_JUMP][4];
__device__ __constant__ uint64_t jPy[NB_JUMP][4];
Expand Down Expand Up @@ -122,6 +122,14 @@ __device__ __constant__ uint64_t _O[] = { 0xBFD25E8CD0364141ULL,0xBAAEDCE6AF48A0

// ---------------------------------------------------------------------------------------

#define Add256(r,a) { \
UADDO1((r)[0], (a)[0]); \
UADDO1((r)[1], (a)[1]); \
UADDO1((r)[2], (a)[2]); \
UADD1((r)[3], (a)[3]);}

// ---------------------------------------------------------------------------------------

#define Neg(r) {\
USUBO(r[0],0ULL,r[0]); \
USUBC(r[1],0ULL,r[1]); \
Expand Down Expand Up @@ -183,16 +191,20 @@ out[pos*ITEM_SIZE32 + 9] = ((uint32_t *)d)[0]; \
out[pos*ITEM_SIZE32 + 10] = ((uint32_t *)d)[1]; \
out[pos*ITEM_SIZE32 + 11] = ((uint32_t *)d)[2]; \
out[pos*ITEM_SIZE32 + 12] = ((uint32_t *)d)[3]; \
out[pos*ITEM_SIZE32 + 13] = ((uint32_t *)idx)[0]; \
out[pos*ITEM_SIZE32 + 14] = ((uint32_t *)idx)[1]; \
out[pos*ITEM_SIZE32 + 13] = ((uint32_t *)d)[4]; \
out[pos*ITEM_SIZE32 + 14] = ((uint32_t *)d)[5]; \
out[pos*ITEM_SIZE32 + 15] = ((uint32_t *)d)[6]; \
out[pos*ITEM_SIZE32 + 16] = ((uint32_t *)d)[7]; \
out[pos*ITEM_SIZE32 + 17] = ((uint32_t *)idx)[0]; \
out[pos*ITEM_SIZE32 + 18] = ((uint32_t *)idx)[1]; \
}

// ---------------------------------------------------------------------------------------

#ifdef USE_SYMMETRY
__device__ void LoadKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2],uint64_t *jumps) {
__device__ void LoadKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4],uint64_t *jumps) {
#else
__device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4]) {
#endif

__syncthreads();
Expand All @@ -216,15 +228,17 @@ __device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t

d64[0] = (a)[IDX + 8 * blockDim.x + stride];
d64[1] = (a)[IDX + 9 * blockDim.x + stride];
d64[2] = (a)[IDX + 10 * blockDim.x + stride];
d64[3] = (a)[IDX + 11 * blockDim.x + stride];

#ifdef USE_SYMMETRY
jumps[g] = (a)[IDX + 10 * blockDim.x + stride];
jumps[g] = (a)[IDX + 12 * blockDim.x + stride];
#endif
}

}

__device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][4]) {

__syncthreads();

Expand All @@ -235,6 +249,8 @@ __device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {

d64[0] = (a)[IDX + 8 * blockDim.x + stride];
d64[1] = (a)[IDX + 9 * blockDim.x + stride];
d64[2] = (a)[IDX + 10 * blockDim.x + stride];
d64[3] = (a)[IDX + 11 * blockDim.x + stride];

}

Expand Down Expand Up @@ -271,9 +287,9 @@ __device__ void LoadKangaroo(uint64_t* a,uint32_t stride,uint64_t px[4]) {
// ---------------------------------------------------------------------------------------

#ifdef USE_SYMMETRY
__device__ void StoreKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2],uint64_t *jumps) {
__device__ void StoreKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4],uint64_t *jumps) {
#else
__device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4]) {
#endif

__syncthreads();
Expand All @@ -296,9 +312,11 @@ __device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_

(a)[IDX + 8 * blockDim.x + stride] = d64[0];
(a)[IDX + 9 * blockDim.x + stride] = d64[1];
(a)[IDX + 10 * blockDim.x + stride] = d64[2];
(a)[IDX + 11 * blockDim.x + stride] = d64[3];

#ifdef USE_SYMMETRY
(a)[IDX + 10 * blockDim.x + stride] = jumps[g];
(a)[IDX + 12 * blockDim.x + stride] = jumps[g];
#endif
}

Expand All @@ -321,7 +339,7 @@ __device__ void StoreKangaroo(uint64_t* a,uint32_t stride,uint64_t px[4],uint64_

}

__device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][4]) {

__syncthreads();

Expand All @@ -331,6 +349,8 @@ __device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {

(a)[IDX + 8 * blockDim.x + stride] = d64[0];
(a)[IDX + 9 * blockDim.x + stride] = d64[1];
(a)[IDX + 10 * blockDim.x + stride] = d64[2];
(a)[IDX + 11 * blockDim.x + stride] = d64[3];

}

Expand Down
Loading

0 comments on commit 7401482

Please sign in to comment.