Skip to content

Commit

Permalink
Various size increases
Browse files Browse the repository at this point in the history
- Increase range size to 256 bits
- Increase DP size to 256 bits
- Network.cpp transmit 256-bit kangaroos
- Backup.cpp export/import 256-bit kangaroos
- XOR all uint64_t members of kangaroo to make hashtable index
  • Loading branch information
ZenulAbidin committed Mar 19, 2021
1 parent 354bb80 commit 7401482
Show file tree
Hide file tree
Showing 17 changed files with 271 additions and 191 deletions.
21 changes: 9 additions & 12 deletions Backup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,Int *x,Int *y,Int *d) {

}

void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,Int* y,Int* d) {
void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int256_t>& kangs,Int* x,Int* y,Int* d) {

uint64_t n = 0;

Expand All @@ -248,8 +248,7 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,In
for(n = 0; n < avail; n++) {

Int dist;
uint32_t type;
HashTable::CalcDistAndType(kangs[n],&dist,&type);
HashTable::CalcDist(&kangs[n],&dist);
dists.push_back(dist);

}
Expand Down Expand Up @@ -277,7 +276,6 @@ void Kangaroo::FetchWalks(uint64_t nbWalk,std::vector<int128_t>& kangs,Int* x,In
}

kangs.erase(kangs.begin(),kangs.begin() + avail);

}

if(avail < nbWalk) {
Expand All @@ -293,7 +291,7 @@ void Kangaroo::FectchKangaroos(TH_PARAM *threads) {
double sFetch = Timer::get_tick();

// From server
vector<int128_t> kangs;
vector<int256_t> kangs;
if(saveKangarooByServer) {
::printf("FectchKangaroosFromServer");
if(!GetKangaroosFromServer(workFile,kangs))
Expand Down Expand Up @@ -492,22 +490,21 @@ void Kangaroo::SaveWork(uint64_t totalCount,double totalTime,TH_PARAM *threads,i
if(saveKangarooByServer) {

::printf("\nSaveWork (Kangaroo->Server): %s",fileName.c_str());
vector<int128_t> kangs;
vector<int256_t> kangs;
for(int i = 0; i < nbThread; i++)
totalWalk += threads[i].nbKangaroo;
kangs.reserve(totalWalk);

for(int i = 0; i < nbThread; i++) {
int128_t X;
int128_t D;
uint64_t h;
int256_t X;
int256_t D;
for(uint64_t n = 0; n < threads[i].nbKangaroo; n++) {
HashTable::Convert(&threads[i].px[n],&threads[i].distance[n],n%2,&h,&X,&D);
kangs.push_back(D);
HashTable::Convert(&threads[i].px[n],&threads[i].distance[n],&X,&D);
kangs.push_back(D);
}
}
SendKangaroosToServer(fileName,kangs);
size = kangs.size()*16 + 16;
size = kangs.size()*32 + 32;
goto end;

} else {
Expand Down
11 changes: 5 additions & 6 deletions Check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
for(uint32_t i = 0; i < nbItem; i++) {
e = hT->E[h].items[i];
Int dist;
uint32_t kType;
HashTable::CalcDistAndType(e->d,&dist,&kType);
uint32_t kType = e->kType;
HashTable::CalcDist(&(e->d),&dist);
dists.push_back(dist);
types.push_back(kType);
}
Expand All @@ -60,8 +60,8 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
::fread(items+i,32,1,f);
e = items + i;
Int dist;
uint32_t kType;
HashTable::CalcDistAndType(e->d,&dist,&kType);
uint32_t kType = e->kType;
HashTable::CalcDist(&(e->d),&dist);
dists.push_back(dist);
types.push_back(kType);
}
Expand All @@ -88,8 +88,7 @@ uint32_t Kangaroo::CheckHash(uint32_t h,uint32_t nbItem,HashTable* hT,FILE* f) {
if(hT) e = hT->E[h].items[i];
else e = items + i;

uint32_t hC = S[i].x.bits64[2] & HASH_MASK;
ok = (hC == h) && (S[i].x.bits64[0] == e->x.i64[0]) && (S[i].x.bits64[1] == e->x.i64[1]);
ok = (S[i].x.bits64[0] == e->x.i64[0]) && (S[i].x.bits64[1] == e->x.i64[1]) && (S[i].x.bits64[2] == e->x.i64[2]) && (S[i].x.bits64[3] == e->x.i64[3]);
if(!ok) nbWrong++;
//if(!ok) {
// ::printf("\nCheckWorkFile wrong at: %06X [%d]\n",h,i);
Expand Down
Binary file added GPU/.GPUMath.h.swp
Binary file not shown.
2 changes: 1 addition & 1 deletion GPU/GPUCompute.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ __device__ void ComputeKangaroos(uint64_t *kangaroos,uint32_t maxFound,uint32_t

uint64_t px[GPU_GRP_SIZE][4];
uint64_t py[GPU_GRP_SIZE][4];
uint64_t dist[GPU_GRP_SIZE][2];
uint64_t dist[GPU_GRP_SIZE][4];
#ifdef USE_SYMMETRY
uint64_t lastJump[GPU_GRP_SIZE];
#endif
Expand Down
40 changes: 25 additions & 15 deletions GPU/GPUEngine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ int _ConvertSMVer2Cores(int major,int minor) {
{ 0x70, 64 },
{ 0x72, 64 },
{ 0x75, 64 },
{ 0x80, 64 },
{ 0x86, 64 },
{ -1, -1 } };

int index = 0;
Expand Down Expand Up @@ -392,27 +394,30 @@ void GPUEngine::SetKangaroos(Int *px,Int *py,Int *d) {
for(int t = 0; t < nbThreadPerGroup; t++) {

// X
inputKangarooPinned[g * strideSize + t + 0 * nbThreadPerGroup] = px[idx].bits64[0];
inputKangarooPinned[g * strideSize + t + 1 * nbThreadPerGroup] = px[idx].bits64[1];
inputKangarooPinned[g * strideSize + t + 2 * nbThreadPerGroup] = px[idx].bits64[2];
inputKangarooPinned[g * strideSize + t + 3 * nbThreadPerGroup] = px[idx].bits64[3];
Int tpx = px[idx];
inputKangarooPinned[g * strideSize + t + 0 * nbThreadPerGroup] = tpx.bits64[0];
inputKangarooPinned[g * strideSize + t + 1 * nbThreadPerGroup] = tpx.bits64[1];
inputKangarooPinned[g * strideSize + t + 2 * nbThreadPerGroup] = tpx.bits64[2];
inputKangarooPinned[g * strideSize + t + 3 * nbThreadPerGroup] = tpx.bits64[3];

// Y
inputKangarooPinned[g * strideSize + t + 4 * nbThreadPerGroup] = py[idx].bits64[0];
inputKangarooPinned[g * strideSize + t + 5 * nbThreadPerGroup] = py[idx].bits64[1];
inputKangarooPinned[g * strideSize + t + 6 * nbThreadPerGroup] = py[idx].bits64[2];
inputKangarooPinned[g * strideSize + t + 7 * nbThreadPerGroup] = py[idx].bits64[3];
Int tpy = py[idx];
inputKangarooPinned[g * strideSize + t + 4 * nbThreadPerGroup] = tpy.bits64[0];
inputKangarooPinned[g * strideSize + t + 5 * nbThreadPerGroup] = tpy.bits64[1];
inputKangarooPinned[g * strideSize + t + 6 * nbThreadPerGroup] = tpy.bits64[2];
inputKangarooPinned[g * strideSize + t + 7 * nbThreadPerGroup] = tpy.bits64[3];

// Distance
Int dOff;
dOff.Set(&d[idx]);
if(idx % 2 == WILD) dOff.ModAddK1order(&wildOffset);
inputKangarooPinned[g * strideSize + t + 8 * nbThreadPerGroup] = dOff.bits64[0];
inputKangarooPinned[g * strideSize + t + 9 * nbThreadPerGroup] = dOff.bits64[1];

inputKangarooPinned[g * strideSize + t + 10 * nbThreadPerGroup] = dOff.bits64[2];
inputKangarooPinned[g * strideSize + t + 11 * nbThreadPerGroup] = dOff.bits64[3];
#ifdef USE_SYMMETRY
// Last jump
inputKangarooPinned[t + 10 * nbThreadPerGroup] = (uint64_t)NB_JUMP;
inputKangarooPinned[t + 12 * nbThreadPerGroup] = (uint64_t)NB_JUMP;
#endif

idx++;
Expand Down Expand Up @@ -490,6 +495,7 @@ void GPUEngine::GetKangaroos(Int *px,Int *py,Int *d) {

}

// I think this is for public keys and initial distances
void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {

int gSize = KSIZE * GPU_GRP_SIZE;
Expand Down Expand Up @@ -528,11 +534,15 @@ void GPUEngine::SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d) {
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 8 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[1];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 9 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[2];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 10 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
inputKangarooPinned[0] = dOff.bits64[3];
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 11 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);

#ifdef USE_SYMMETRY
// Last jump
inputKangarooPinned[0] = (uint64_t)NB_JUMP;
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 10 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
cudaMemcpy(inputKangaroo + (b * blockSize + g * strideSize + t + 12 * nbThreadPerGroup),inputKangarooPinned,8,cudaMemcpyHostToDevice);
#endif
}
Expand Down Expand Up @@ -561,8 +571,8 @@ void GPUEngine::SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py) {
this->dpMask = dpMask;
for(int i=0;i< NB_JUMP;i++)
memcpy(jumpPinned + 2*i,distance[i].bits64,16);
cudaMemcpyToSymbol(jD,jumpPinned,jumpSize/2);
memcpy(jumpPinned + 4*i,distance[i].bits64,32);
cudaMemcpyToSymbol(jD,jumpPinned,jumpSize);
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
printf("GPUEngine: SetParams: Failed to copy to constant memory: %s\n",cudaGetErrorString(err));
Expand Down Expand Up @@ -666,8 +676,8 @@ bool GPUEngine::Launch(std::vector<ITEM> &hashFound,bool spinWait) {
uint64_t *d = (uint64_t *)(itemPtr + 8);
it.d.bits64[0] = d[0];
it.d.bits64[1] = d[1];
it.d.bits64[2] = 0;
it.d.bits64[3] = 0;
it.d.bits64[2] = d[2];
it.d.bits64[3] = d[3];
it.d.bits64[4] = 0;
if(it.kIdx % 2 == WILD) it.d.ModSubK1order(&wildOffset);
Expand Down
7 changes: 4 additions & 3 deletions GPU/GPUEngine.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
typedef struct {
Int x;
Int d;
uint64_t kIdx;
uint64_t kIdx; // Appears like this is used as kType
uint64_t h;
} ITEM;

class GPUEngine {
Expand All @@ -44,9 +45,9 @@ class GPUEngine {
GPUEngine(int nbThreadGroup,int nbThreadPerGroup,int gpuId,uint32_t maxFound);
~GPUEngine();
void SetParams(uint64_t dpMask,Int *distance,Int *px,Int *py);
void SetKangaroos(Int *px,Int *py,Int *d);
void SetKangaroos(uint64_t kIdx, Int *px,Int *py,Int *d);
void GetKangaroos(Int *px,Int *py,Int *d);
void SetKangaroo(uint64_t kIdx,Int *px,Int *py,Int *d);
void SetKangaroo(Int *px,Int *py,Int *d);
bool Launch(std::vector<ITEM> &hashFound,bool spinWait = false);
void SetWildOffset(Int *offset);
int GetNbThread();
Expand Down
16 changes: 8 additions & 8 deletions GPU/GPUGenerate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ using namespace std;
void GPUEngine::GenerateCode(Secp256K1 *secp) {

// Compute generator table
Int jumpDistance[129];
Point jumpPoint[129];
Int jumpDistance[257];
Point jumpPoint[257];

// Kangaroo jumps
jumpPoint[0] = secp->G;
jumpDistance[0].SetInt32(1);
for(int i = 1; i < 129; ++i) {
for(int i = 1; i < 257; ++i) {
jumpDistance[i].Add(&jumpDistance[i - 1],&jumpDistance[i - 1]);
jumpPoint[i] = secp->DoubleDirect(jumpPoint[i - 1]);
}
Expand All @@ -40,22 +40,22 @@ void GPUEngine::GenerateCode(Secp256K1 *secp) {

fprintf(f,"// File generated by GPUEngine::GenerateCode()\n");

fprintf(f,"// Jump distance table (Contains 1,2,4,...,2^129\n");
fprintf(f,"// Jump distance table (Contains 1,2,4,...,2^256)\n");
fprintf(f,"__device__ __constant__ uint64_t jD[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpDistance[i].GetC64Str(4).c_str());
}
fprintf(f,"};\n");

fprintf(f,"// Jump point table (Contains G,2G,4G,...,2^129.G)\n");
fprintf(f,"// Jump point table (Contains G,2G,4G,...,2^256.G)\n");
fprintf(f,"__device__ __constant__ uint64_t jPx[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpPoint[i].x.GetC64Str(4).c_str());
}
fprintf(f,"};\n");

fprintf(f,"__device__ __constant__ uint64_t jPy[][4] = {\n");
for(int i = 0; i < 129; i++) {
for(int i = 0; i < 257; i++) {
fprintf(f," %s,\n",jumpPoint[i].y.GetC64Str(4).c_str());
}
fprintf(f,"};\n\n");
Expand Down
42 changes: 31 additions & 11 deletions GPU/GPUMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#define MADDS(r,a,b,c) asm volatile ("madc.hi.s64 %0, %1, %2, %3;" : "=l"(r) : "l"(a), "l"(b), "l"(c));

// Jump distance
__device__ __constant__ uint64_t jD[NB_JUMP][2];
__device__ __constant__ uint64_t jD[NB_JUMP][4];
// jump points
__device__ __constant__ uint64_t jPx[NB_JUMP][4];
__device__ __constant__ uint64_t jPy[NB_JUMP][4];
Expand Down Expand Up @@ -122,6 +122,14 @@ __device__ __constant__ uint64_t _O[] = { 0xBFD25E8CD0364141ULL,0xBAAEDCE6AF48A0

// ---------------------------------------------------------------------------------------

#define Add256(r,a) { \
UADDO1((r)[0], (a)[0]); \
UADDO1((r)[1], (a)[1]); \
UADDO1((r)[2], (a)[2]); \
UADD1((r)[3], (a)[3]);}

// ---------------------------------------------------------------------------------------

#define Neg(r) {\
USUBO(r[0],0ULL,r[0]); \
USUBC(r[1],0ULL,r[1]); \
Expand Down Expand Up @@ -183,16 +191,20 @@ out[pos*ITEM_SIZE32 + 9] = ((uint32_t *)d)[0]; \
out[pos*ITEM_SIZE32 + 10] = ((uint32_t *)d)[1]; \
out[pos*ITEM_SIZE32 + 11] = ((uint32_t *)d)[2]; \
out[pos*ITEM_SIZE32 + 12] = ((uint32_t *)d)[3]; \
out[pos*ITEM_SIZE32 + 13] = ((uint32_t *)idx)[0]; \
out[pos*ITEM_SIZE32 + 14] = ((uint32_t *)idx)[1]; \
out[pos*ITEM_SIZE32 + 13] = ((uint32_t *)d)[4]; \
out[pos*ITEM_SIZE32 + 14] = ((uint32_t *)d)[5]; \
out[pos*ITEM_SIZE32 + 15] = ((uint32_t *)d)[6]; \
out[pos*ITEM_SIZE32 + 16] = ((uint32_t *)d)[7]; \
out[pos*ITEM_SIZE32 + 17] = ((uint32_t *)idx)[0]; \
out[pos*ITEM_SIZE32 + 18] = ((uint32_t *)idx)[1]; \
}

// ---------------------------------------------------------------------------------------

#ifdef USE_SYMMETRY
__device__ void LoadKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2],uint64_t *jumps) {
__device__ void LoadKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4],uint64_t *jumps) {
#else
__device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4]) {
#endif

__syncthreads();
Expand All @@ -216,15 +228,17 @@ __device__ void LoadKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t

d64[0] = (a)[IDX + 8 * blockDim.x + stride];
d64[1] = (a)[IDX + 9 * blockDim.x + stride];
d64[2] = (a)[IDX + 10 * blockDim.x + stride];
d64[3] = (a)[IDX + 11 * blockDim.x + stride];

#ifdef USE_SYMMETRY
jumps[g] = (a)[IDX + 10 * blockDim.x + stride];
jumps[g] = (a)[IDX + 12 * blockDim.x + stride];
#endif
}

}

__device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][4]) {

__syncthreads();

Expand All @@ -235,6 +249,8 @@ __device__ void LoadDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {

d64[0] = (a)[IDX + 8 * blockDim.x + stride];
d64[1] = (a)[IDX + 9 * blockDim.x + stride];
d64[2] = (a)[IDX + 10 * blockDim.x + stride];
d64[3] = (a)[IDX + 11 * blockDim.x + stride];

}

Expand Down Expand Up @@ -271,9 +287,9 @@ __device__ void LoadKangaroo(uint64_t* a,uint32_t stride,uint64_t px[4]) {
// ---------------------------------------------------------------------------------------

#ifdef USE_SYMMETRY
__device__ void StoreKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2],uint64_t *jumps) {
__device__ void StoreKangaroos(uint64_t *a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4],uint64_t *jumps) {
#else
__device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_t py[GPU_GRP_SIZE][4],uint64_t dist[GPU_GRP_SIZE][4]) {
#endif

__syncthreads();
Expand All @@ -296,9 +312,11 @@ __device__ void StoreKangaroos(uint64_t * a,uint64_t px[GPU_GRP_SIZE][4],uint64_

(a)[IDX + 8 * blockDim.x + stride] = d64[0];
(a)[IDX + 9 * blockDim.x + stride] = d64[1];
(a)[IDX + 10 * blockDim.x + stride] = d64[2];
(a)[IDX + 11 * blockDim.x + stride] = d64[3];

#ifdef USE_SYMMETRY
(a)[IDX + 10 * blockDim.x + stride] = jumps[g];
(a)[IDX + 12 * blockDim.x + stride] = jumps[g];
#endif
}

Expand All @@ -321,7 +339,7 @@ __device__ void StoreKangaroo(uint64_t* a,uint32_t stride,uint64_t px[4],uint64_

}

__device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {
__device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][4]) {

__syncthreads();

Expand All @@ -331,6 +349,8 @@ __device__ void StoreDists(uint64_t* a,uint64_t dist[GPU_GRP_SIZE][2]) {

(a)[IDX + 8 * blockDim.x + stride] = d64[0];
(a)[IDX + 9 * blockDim.x + stride] = d64[1];
(a)[IDX + 10 * blockDim.x + stride] = d64[2];
(a)[IDX + 11 * blockDim.x + stride] = d64[3];

}

Expand Down
Loading

0 comments on commit 7401482

Please sign in to comment.