Fix CUDA kangaroo not compiling

ZenulAbidin · Mar 24, 2021 · e89425f · e89425f
1 parent 7bce2e7
commit e89425f
Show file tree

Hide file tree

Showing 21 changed files with 2,450 additions and 39 deletions.
diff --git a/Check.cpp b/Check.cpp
@@ -512,8 +512,10 @@ void Kangaroo::Check(std::vector<int> gpuId,std::vector<int> gridSize) {
     for(int i=0;i<nb;i++) lastJump[i]=NB_JUMP;
 
     CreateJumpTable();
-
-    h.SetParams(dMask,jumpDistance,jumpPointx,jumpPointy);
+
+    Int dMaskInt;
+    HashTable::toInt(&dMask,&dMaskInt);
+    h.SetParams(&dMaskInt,jumpDistance,jumpPointx,jumpPointy);
     h.SetWildOffset(&rangeWidthDiv2);
     h.SetKangaroos(cpuPx,cpuPy,cpuD);
 
@@ -554,7 +556,7 @@ void Kangaroo::Check(std::vector<int> gpuId,std::vector<int> gridSize) {
         lastJump[i] = jmp;
 #endif
 
-        if(IsDP(cpuPx[i].bits64[3])) {
+        if(IsDP(&cpuPx[i])) {
 
           // Search for DP found
           bool found = false;

diff --git a/GPU/.GPUEngine.cu.swp b/GPU/.GPUEngine.cu.swp
diff --git a/GPU/.GPUEngine.h.swp b/GPU/.GPUEngine.h.swp
diff --git a/GPU/GPUCompute.h b/GPU/GPUCompute.h
@@ -19,7 +19,7 @@
 
 // -----------------------------------------------------------------------------------------
 
-__device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *out,uint64_t dpMask) {
+__device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *out,uint64_t *dpMask) {
 
   uint64_t px[GPU_GRP_SIZE][4];
   uint64_t py[GPU_GRP_SIZE][4];
@@ -34,6 +34,7 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
   uint64_t ry[4];
   uint64_t _s[4];
   uint64_t _p[4];
+  uint64_t dpmask0, dpmask1, dpmask2, dpmask3;
   uint32_t jmp;
 
 #ifdef USE_SYMMETRY
@@ -42,6 +43,11 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
   LoadKangaroos(kangaroos,px,py,dist);
 #endif
 
+  dpmask0 = dpMask[0];
+  dpmask1 = dpMask[1];
+  dpmask2 = dpMask[2];
+  dpmask3 = dpMask[3];
+
   for(int run = 0; run < NB_RUN; run++) {
 
     // P1 = jumpPoint
@@ -86,14 +92,14 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
       Load256(px[g],rx);
       Load256(py[g],ry);
 
-      Add128(dist[g],jD[jmp]);
+      Add256(dist[g],jD[jmp]);
 
 #ifdef USE_SYMMETRY
       if(ModPositive256(py[g]))
         ModNeg256Order(dist[g]);
 #endif
-
-      if((px[g][3] & dpMask) == 0) {
+      uint64_t *pxg = px[g];
+      if((pxg[0] & dpmask0) == 0 && (pxg[1] & dpmask1) == 0 && (pxg[2] & dpmask2) == 0 && (pxg[3] & dpmask3) == 0) {
 
         // Distinguished point
         uint32_t pos = atomicAdd(out,1);

diff --git a/GPU/GPUEngine.cu b/GPU/GPUEngine.cu
@@ -32,7 +32,7 @@
 
 // ---------------------------------------------------------------------------------------
 
-__global__ void comp_kangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *found,uint64_t dpMask) {
+__global__ void comp_kangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *found,uint64_t *dpMask) {
 
   int xPtr = (blockIdx.x*blockDim.x*GPU_GRP_SIZE) * KSIZE; // x[4] , y[4] , d[2], lastJump
   ComputeKangaroos(kangaroos + xPtr,maxFound,found,dpMask);
@@ -198,6 +198,14 @@ GPUEngine::GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t m
   outputItem = NULL;
   outputItemPinned = NULL;
   jumpPinned = NULL;
+  dpMask=NULL;
+
+  // dpMask
+  err = cudaMalloc((void **)&dpMask,32);
+  if(err != cudaSuccess) {
+    printf("GPUEngine: Allocate dpmask memory: %s\n",cudaGetErrorString(err));
+    return;
+  }
 
   // Input kangaroos
   kangarooSize = nbThread * GPU_GRP_SIZE * KSIZE * 8;
@@ -256,6 +264,7 @@ GPUEngine::GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t m
 
 GPUEngine::~GPUEngine() {
 
+  if(dpMask) cudaFree(dpMask);
   if(inputKangaroo) cudaFree(inputKangaroo);
   if(outputItem) cudaFree(outputItem);
   if(inputKangarooPinned) cudaFreeHost(inputKangarooPinned);
@@ -413,7 +422,7 @@ void GPUEngine::SetKangaroos(Int *px,Int *py,Int *d) {
         if(idx % 2 == WILD) dOff.ModAddK1order(&wildOffset);
         inputKangarooPinned[g * strideSize + t + 8 * nbThreadPerGroup] = dOff.bits64[0];
         inputKangarooPinned[g * strideSize + t + 9 * nbThreadPerGroup] = dOff.bits64[1];
-        inputKangarooPinned[g * stridesize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
+        inputKangarooPinned[g * strideSize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
 	inputKangarooPinned[g * strideSize + t + 11 * nbThreadPerGroup] = dOff.bits64[3];
 #ifdef USE_SYMMETRY
         // Last jump
@@ -495,7 +504,6 @@ void GPUEngine::GetKangaroos(Int *px,Int *py,Int *d) {
 
 }
 
-// I think this is for public leys and initial distances
 void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {
 
   int gSize = KSIZE * GPU_GRP_SIZE;
@@ -542,7 +550,7 @@ void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {
 #ifdef USE_SYMMETRY
   // Last jump
   inputKangarooPinned[0] = (uint64_t)NB_JUMP;
-  cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);"
+  cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
 #endif
 
 }
@@ -566,16 +574,20 @@ bool GPUEngine::callKernel() {
 
 }
 
-void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
-  
-  this->dpMask = dpMask;
+void GPUEngine::SetParams(Int *dpMask,Int *distance,Int *px,Int *py) {
+  uint64_t hostDpMask[4];
 
+  hostDpMask[0] = dpMask->bits64[0];
+  hostDpMask[1] = dpMask->bits64[1];
+  hostDpMask[2] = dpMask->bits64[2];
+  hostDpMask[3] = dpMask->bits64[3];
+  cudaMemcpy(this->dpMask, hostDpMask, 32, cudaMemcpyHostToDevice);
   for(int i=0;i< NB_JUMP;i++)
     memcpy(jumpPinned + 4*i,distance[i].bits64,32);
   cudaMemcpyToSymbol(jD,jumpPinned,jumpSize);
   cudaError_t err = cudaGetLastError();
   if(err != cudaSuccess) {
-    printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
+    printf("GPUEngine: SetParams: Failed to copy to constant memory (distance): %s\n",cudaGetErrorString(err));
     return;
   }
 
@@ -584,7 +596,7 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
   cudaMemcpyToSymbol(jPx,jumpPinned,jumpSize);
   err = cudaGetLastError();
   if(err != cudaSuccess) {
-    printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
+    printf("GPUEngine: SetParams: Failed to copy to constant memory (px): %s\n",cudaGetErrorString(err));
     return;
   }
 
@@ -593,7 +605,7 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
   cudaMemcpyToSymbol(jPy,jumpPinned,jumpSize);
   err = cudaGetLastError();
   if(err != cudaSuccess) {
-    printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
+    printf("GPUEngine: SetParams: Failed to copy to constant memory (py): %s\n",cudaGetErrorString(err));
     return;
   }
 

diff --git a/GPU/GPUEngine.h b/GPU/GPUEngine.h
@@ -44,10 +44,10 @@ class GPUEngine {
 
   GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t maxFound);
   ~GPUEngine();
-  void SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py);
-  void SetKangaroos(uint64_t kIdx, Int *px,Int *py,Int *d);
+  void SetParams(Int *dpMask,Int *distance,Int *px,Int *py);
+  void SetKangaroos(Int *px,Int *py,Int *d);
   void GetKangaroos(Int *px,Int *py,Int *d);
-  void SetKangaroo(Int *px,Int *py,Int *d);
+  void SetKangaroo(uint64_t kIdx, Int *px,Int *py,Int *d);
   bool Launch(std::vector<ITEM> &hashFound,bool spinWait = false);
   void SetWildOffset(Int *offset);
   int GetNbThread();
@@ -80,8 +80,7 @@ class GPUEngine {
   uint32_t kangarooSize;
   uint32_t kangarooSizePinned;
   uint32_t jumpSize;
-  uint64_t dpMask;
-
+  uint64_t *dpMask;
 };
 
 #endif // GPUENGINEH
diff --git a/HashTable.h b/HashTable.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include "SECPK1/Point.h"
+#include "Constants.h"
 #ifdef WIN64
 #include <Windows.h>
 #endif
@@ -33,23 +34,13 @@
 #define ADD_DUPLICATE 1
 #define ADD_COLLISION 2
 
-union int128_s {
-
-  uint8_t  i8[16];
-  uint16_t i16[8];
-  uint32_t i32[4];
-  uint64_t i64[2];
-
-};
-
 union int256_s {
 	uint8_t  i8[32];
 	uint16_t i16[16];
 	uint32_t i32[8];
 	uint64_t i64[4];
 };
 
-typedef union int128_s int128_t;
 typedef union int256_s int256_t;
 
 #define safe_free(x) if(x) {free(x);x=NULL;}

diff --git a/Kangaroo.cpp b/Kangaroo.cpp
@@ -557,7 +557,9 @@ void Kangaroo::SolveKeyGPU(TH_PARAM *ph) {
 #else
   gpu->SetWildOffset(&rangeWidthDiv2);
 #endif
-  gpu->SetParams(dMask,jumpDistance,jumpPointx,jumpPointy);
+  Int dmaskInt;
+  HashTable::toInt(&dMask, &dmaskInt);
+  gpu->SetParams(&dmaskInt,jumpDistance,jumpPointx,jumpPointy);
   gpu->SetKangaroos(ph->px,ph->py,ph->distance);
 
   if(workFile.length()==0 || !saveKangaroo) {

diff --git a/Kangaroo.h b/Kangaroo.h
@@ -181,7 +181,6 @@ class Kangaroo {
   void SaveWork(uint64_t totalCount,double totalTime,TH_PARAM *threads,int nbThread);
   void SaveServerWork();
   void FetchWalks(uint64_t nbWalk,Int *x,Int *y,Int *d);
-  void FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,Int* y,Int* d);
   void FetchWalks(uint64_t nbWalk,std::vector<int256_t>& kangs,Int* x,Int* y,Int* d);
   void FectchKangaroos(TH_PARAM *threads);
   FILE *ReadHeader(std::string fileName,uint32_t *version,int type);

diff --git a/Makefile b/Makefile
@@ -40,10 +40,27 @@ OBJET = $(addprefix $(OBJDIR)/, \
 endif
 
 CXX        = g++
-CUDA       = /usr/local/cuda-8.0
-CXXCUDA    = /usr/bin/g++-4.8
+CUDA       = /usr/local/cuda
+CXXCUDA    = /usr/bin/g++
 NVCC       = $(CUDA)/bin/nvcc
 
+
+all: driverquery bsgs
+
+ifdef gpu
+ifndef ccap
+driverquery:
+	. ./detect_cuda.sh
+ccap=$(shell cat cuda_version.txt)
+else
+driverquery:
+	@echo "Compiling against manually selected CUDA version ${ccap}"
+endif
+else
+driverquery:
+endif
+
+
 ifdef gpu
 
 ifdef debug
@@ -79,8 +96,6 @@ endif
 $(OBJDIR)/%.o : %.cpp
 	$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-all: bsgs
-
 bsgs: $(OBJET)
 	@echo Making Kangaroo...
 	$(CXX) $(OBJET) $(LFLAGS) -o kangaroo
@@ -101,4 +116,6 @@ clean:
 	@rm -f obj/*.o
 	@rm -f obj/GPU/*.o
 	@rm -f obj/SECPK1/*.o
+	@rm -f cuda_version.txt
+	@rm -f cuda_build_log.txt
 
diff --git a/Network.cpp b/Network.cpp
@@ -1016,7 +1016,7 @@ bool Kangaroo::GetKangaroosFromServer(std::string& fileName,std::vector<int256_t
     uint64_t point = (nbKangaroo / KANG_PER_BLOCK) / 32;
     uint64_t pointPrint = 0;
 
-    KBuff = (int256_t*)malloc(KANG_PER_BLOCK * sizeof(int128_t));
+    KBuff = (int256_t*)malloc(KANG_PER_BLOCK * sizeof(int256_t));
     kangs.reserve(nbKangaroo);
 
     checkSum.SetInt32(0);

diff --git a/detect_cuda.sh b/detect_cuda.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+ccap=""
+cd deviceQuery
+echo "Attempting to autodetect CUDA compute capability..."
+make >cuda_build_log.txt 2>&1 && ccap=$(./deviceQuery | grep "CUDA Capability" |  awk -F '    ' '{print $2}' | sort -n | head -n 1 | sed 's/\.//')
+if [ -n "${ccap}" ]; then
+	echo "Detected ccap=${ccap}"
+else
+	echo "Autodetection failed, falling back to ccap=30 (set the ccap variable to override this)"
+	ccap="30"
+fi
+cd -
+echo ${ccap} > cuda_version.txt