Skip to content

Commit

Permalink
Fix CUDA kangaroo not compiling
Browse files Browse the repository at this point in the history
  • Loading branch information
ZenulAbidin committed Mar 24, 2021
1 parent 7bce2e7 commit e89425f
Show file tree
Hide file tree
Showing 21 changed files with 2,450 additions and 39 deletions.
8 changes: 5 additions & 3 deletions Check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,10 @@ void Kangaroo::Check(std::vector<int> gpuId,std::vector<int> gridSize) {
for(int i=0;i<nb;i++) lastJump[i]=NB_JUMP;

CreateJumpTable();

h.SetParams(dMask,jumpDistance,jumpPointx,jumpPointy);

Int dMaskInt;
HashTable::toInt(&dMask,&dMaskInt);
h.SetParams(&dMaskInt,jumpDistance,jumpPointx,jumpPointy);
h.SetWildOffset(&rangeWidthDiv2);
h.SetKangaroos(cpuPx,cpuPy,cpuD);

Expand Down Expand Up @@ -554,7 +556,7 @@ void Kangaroo::Check(std::vector<int> gpuId,std::vector<int> gridSize) {
lastJump[i] = jmp;
#endif

if(IsDP(cpuPx[i].bits64[3])) {
if(IsDP(&cpuPx[i])) {

// Search for DP found
bool found = false;
Expand Down
Binary file added GPU/.GPUEngine.cu.swp
Binary file not shown.
Binary file added GPU/.GPUEngine.h.swp
Binary file not shown.
14 changes: 10 additions & 4 deletions GPU/GPUCompute.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

// -----------------------------------------------------------------------------------------

__device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *out,uint64_t dpMask) {
__device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *out,uint64_t *dpMask) {

uint64_t px[GPU_GRP_SIZE][4];
uint64_t py[GPU_GRP_SIZE][4];
Expand All @@ -34,6 +34,7 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
uint64_t ry[4];
uint64_t _s[4];
uint64_t _p[4];
uint64_t dpmask0, dpmask1, dpmask2, dpmask3;
uint32_t jmp;

#ifdef USE_SYMMETRY
Expand All @@ -42,6 +43,11 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
LoadKangaroos(kangaroos,px,py,dist);
#endif

dpmask0 = dpMask[0];
dpmask1 = dpMask[1];
dpmask2 = dpMask[2];
dpmask3 = dpMask[3];

for(int run = 0; run < NB_RUN; run++) {

// P1 = jumpPoint
Expand Down Expand Up @@ -86,14 +92,14 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t
Load256(px[g],rx);
Load256(py[g],ry);

Add128(dist[g],jD[jmp]);
Add256(dist[g],jD[jmp]);

#ifdef USE_SYMMETRY
if(ModPositive256(py[g]))
ModNeg256Order(dist[g]);
#endif

if((px[g][3] & dpMask) == 0) {
uint64_t *pxg = px[g];
if((pxg[0] & dpmask0) == 0 && (pxg[1] & dpmask1) == 0 && (pxg[2] & dpmask2) == 0 && (pxg[3] & dpmask3) == 0) {

// Distinguished point
uint32_t pos = atomicAdd(out,1);
Expand Down
32 changes: 22 additions & 10 deletions GPU/GPUEngine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

// ---------------------------------------------------------------------------------------

__global__ void comp_kangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *found,uint64_t dpMask) {
__global__ void comp_kangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t *found,uint64_t *dpMask) {

int xPtr = (blockIdx.x*blockDim.x*GPU_GRP_SIZE) * KSIZE; // x[4] , y[4] , d[2], lastJump
ComputeKangaroos(kangaroos + xPtr,maxFound,found,dpMask);
Expand Down Expand Up @@ -198,6 +198,14 @@ GPUEngine::GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t m
outputItem = NULL;
outputItemPinned = NULL;
jumpPinned = NULL;
dpMask=NULL;

// dpMask
err = cudaMalloc((void **)&dpMask,32);
if(err != cudaSuccess) {
printf("GPUEngine: Allocate dpmask memory: %s\n",cudaGetErrorString(err));
return;
}

// Input kangaroos
kangarooSize = nbThread * GPU_GRP_SIZE * KSIZE * 8;
Expand Down Expand Up @@ -256,6 +264,7 @@ GPUEngine::GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t m

GPUEngine::~GPUEngine() {

if(dpMask) cudaFree(dpMask);
if(inputKangaroo) cudaFree(inputKangaroo);
if(outputItem) cudaFree(outputItem);
if(inputKangarooPinned) cudaFreeHost(inputKangarooPinned);
Expand Down Expand Up @@ -413,7 +422,7 @@ void GPUEngine::SetKangaroos(Int *px,Int *py,Int *d) {
if(idx % 2 == WILD) dOff.ModAddK1order(&wildOffset);
inputKangarooPinned[g * strideSize + t + 8 * nbThreadPerGroup] = dOff.bits64[0];
inputKangarooPinned[g * strideSize + t + 9 * nbThreadPerGroup] = dOff.bits64[1];
inputKangarooPinned[g * stridesize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
inputKangarooPinned[g * strideSize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
inputKangarooPinned[g * strideSize + t + 11 * nbThreadPerGroup] = dOff.bits64[3];
#ifdef USE_SYMMETRY
// Last jump
Expand Down Expand Up @@ -495,7 +504,6 @@ void GPUEngine::GetKangaroos(Int *px,Int *py,Int *d) {

}

// I think this is for public leys and initial distances
void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {

int gSize = KSIZE * GPU_GRP_SIZE;
Expand Down Expand Up @@ -542,7 +550,7 @@ void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {
#ifdef USE_SYMMETRY
// Last jump
inputKangarooPinned[0] = (uint64_t)NB_JUMP;
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);"
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
#endif

}
Expand All @@ -566,16 +574,20 @@ bool GPUEngine::callKernel() {

}

void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
this->dpMask = dpMask;
void GPUEngine::SetParams(Int *dpMask,Int *distance,Int *px,Int *py) {
uint64_t hostDpMask[4];

hostDpMask[0] = dpMask->bits64[0];
hostDpMask[1] = dpMask->bits64[1];
hostDpMask[2] = dpMask->bits64[2];
hostDpMask[3] = dpMask->bits64[3];
cudaMemcpy(this->dpMask, hostDpMask, 32, cudaMemcpyHostToDevice);
for(int i=0;i< NB_JUMP;i++)
memcpy(jumpPinned + 4*i,distance[i].bits64,32);
cudaMemcpyToSymbol(jD,jumpPinned,jumpSize);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
printf("GPUEngine: SetParams: Failed to copy to constant memory (distance): %s\n",cudaGetErrorString(err));
return;
}

Expand All @@ -584,7 +596,7 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
cudaMemcpyToSymbol(jPx,jumpPinned,jumpSize);
err = cudaGetLastError();
if(err != cudaSuccess) {
printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
printf("GPUEngine: SetParams: Failed to copy to constant memory (px): %s\n",cudaGetErrorString(err));
return;
}

Expand All @@ -593,7 +605,7 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
cudaMemcpyToSymbol(jPy,jumpPinned,jumpSize);
err = cudaGetLastError();
if(err != cudaSuccess) {
printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
printf("GPUEngine: SetParams: Failed to copy to constant memory (py): %s\n",cudaGetErrorString(err));
return;
}

Expand Down
9 changes: 4 additions & 5 deletions GPU/GPUEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ class GPUEngine {

GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t maxFound);
~GPUEngine();
void SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py);
void SetKangaroos(uint64_t kIdx, Int *px,Int *py,Int *d);
void SetParams(Int *dpMask,Int *distance,Int *px,Int *py);
void SetKangaroos(Int *px,Int *py,Int *d);
void GetKangaroos(Int *px,Int *py,Int *d);
void SetKangaroo(Int *px,Int *py,Int *d);
void SetKangaroo(uint64_t kIdx, Int *px,Int *py,Int *d);
bool Launch(std::vector<ITEM> &hashFound,bool spinWait = false);
void SetWildOffset(Int *offset);
int GetNbThread();
Expand Down Expand Up @@ -80,8 +80,7 @@ class GPUEngine {
uint32_t kangarooSize;
uint32_t kangarooSizePinned;
uint32_t jumpSize;
uint64_t dpMask;

uint64_t *dpMask;
};

#endif // GPUENGINEH
11 changes: 1 addition & 10 deletions HashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <string>
#include <vector>
#include "SECPK1/Point.h"
#include "Constants.h"
#ifdef WIN64
#include <Windows.h>
#endif
Expand All @@ -33,23 +34,13 @@
#define ADD_DUPLICATE 1
#define ADD_COLLISION 2

union int128_s {

uint8_t i8[16];
uint16_t i16[8];
uint32_t i32[4];
uint64_t i64[2];

};

union int256_s {
uint8_t i8[32];
uint16_t i16[16];
uint32_t i32[8];
uint64_t i64[4];
};

typedef union int128_s int128_t;
typedef union int256_s int256_t;

#define safe_free(x) if(x) {free(x);x=NULL;}
Expand Down
4 changes: 3 additions & 1 deletion Kangaroo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,9 @@ void Kangaroo::SolveKeyGPU(TH_PARAM *ph) {
#else
gpu->SetWildOffset(&rangeWidthDiv2);
#endif
gpu->SetParams(dMask,jumpDistance,jumpPointx,jumpPointy);
Int dmaskInt;
HashTable::toInt(&dMask, &dmaskInt);
gpu->SetParams(&dmaskInt,jumpDistance,jumpPointx,jumpPointy);
gpu->SetKangaroos(ph->px,ph->py,ph->distance);

if(workFile.length()==0 || !saveKangaroo) {
Expand Down
1 change: 0 additions & 1 deletion Kangaroo.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,6 @@ class Kangaroo {
void SaveWork(uint64_t totalCount,double totalTime,TH_PARAM *threads,int nbThread);
void SaveServerWork();
void FetchWalks(uint64_t nbWalk,Int *x,Int *y,Int *d);
void FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,Int* y,Int* d);
void FetchWalks(uint64_t nbWalk,std::vector<int256_t>& kangs,Int* x,Int* y,Int* d);
void FectchKangaroos(TH_PARAM *threads);
FILE *ReadHeader(std::string fileName,uint32_t *version,int type);
Expand Down
25 changes: 21 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,27 @@ OBJET = $(addprefix $(OBJDIR)/, \
endif

CXX = g++
CUDA = /usr/local/cuda-8.0
CXXCUDA = /usr/bin/g++-4.8
CUDA = /usr/local/cuda
CXXCUDA = /usr/bin/g++
NVCC = $(CUDA)/bin/nvcc


all: driverquery bsgs

ifdef gpu
ifndef ccap
driverquery:
. ./detect_cuda.sh
ccap=$(shell cat cuda_version.txt)
else
driverquery:
@echo "Compiling against manually selected CUDA version ${ccap}"
endif
else
driverquery:
endif


ifdef gpu

ifdef debug
Expand Down Expand Up @@ -79,8 +96,6 @@ endif
$(OBJDIR)/%.o : %.cpp
$(CXX) $(CXXFLAGS) -o $@ -c $<

all: bsgs

bsgs: $(OBJET)
@echo Making Kangaroo...
$(CXX) $(OBJET) $(LFLAGS) -o kangaroo
Expand All @@ -101,4 +116,6 @@ clean:
@rm -f obj/*.o
@rm -f obj/GPU/*.o
@rm -f obj/SECPK1/*.o
@rm -f cuda_version.txt
@rm -f cuda_build_log.txt

2 changes: 1 addition & 1 deletion Network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,7 @@ bool Kangaroo::GetKangaroosFromServer(std::string& fileName,std::vector<int256_t
uint64_t point = (nbKangaroo / KANG_PER_BLOCK) / 32;
uint64_t pointPrint = 0;

KBuff = (int256_t*)malloc(KANG_PER_BLOCK * sizeof(int128_t));
KBuff = (int256_t*)malloc(KANG_PER_BLOCK * sizeof(int256_t));
kangs.reserve(nbKangaroo);

checkSum.SetInt32(0);
Expand Down
13 changes: 13 additions & 0 deletions detect_cuda.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
ccap=""
cd deviceQuery
echo "Attempting to autodetect CUDA compute capability..."
make >cuda_build_log.txt 2>&1 && ccap=$(./deviceQuery | grep "CUDA Capability" | awk -F ' ' '{print $2}' | sort -n | head -n 1 | sed 's/\.//')
if [ -n "${ccap}" ]; then
echo "Detected ccap=${ccap}"
else
echo "Autodetection failed, falling back to ccap=30 (set the ccap variable to override this)"
ccap="30"
fi
cd -
echo ${ccap} > cuda_version.txt
Loading

0 comments on commit e89425f

Please sign in to comment.