From 4e902231a996c215b80d771d76c1cea60f68bf82 Mon Sep 17 00:00:00 2001 From: JianxiaoYang Date: Wed, 25 Oct 2023 15:43:21 -0700 Subject: [PATCH] clean up --- src/cyclops/CyclicCoordinateDescent.cpp | 11 ++-- src/cyclops/engine/CudaKernel.cu | 10 ++- src/cyclops/engine/CudaKernel.h | 2 + src/cyclops/engine/GpuModelSpecificsCox.hpp | 69 +++++++++++++-------- src/cyclops/engine/ModelSpecifics.h | 2 +- src/cyclops/engine/ModelSpecifics.hpp | 10 +-- 6 files changed, 64 insertions(+), 40 deletions(-) diff --git a/src/cyclops/CyclicCoordinateDescent.cpp b/src/cyclops/CyclicCoordinateDescent.cpp index 844c2826..25145e1b 100644 --- a/src/cyclops/CyclicCoordinateDescent.cpp +++ b/src/cyclops/CyclicCoordinateDescent.cpp @@ -255,7 +255,6 @@ int CyclicCoordinateDescent::getAlignedLength(int N) { } void CyclicCoordinateDescent::computeNEvents() { -//<<<<<<< HEAD if (syncCV) { for (int i=0; i 0 ? cWeights.data() : nullptr, useCrossValidation); } -/* -======= +/* //modelSpecifics.setWeights( // hWeights.size() > 0 ? hWeights.data() : nullptr, // useCrossValidation); @@ -278,7 +276,6 @@ void CyclicCoordinateDescent::computeNEvents() { hWeights.size() > 0 ? hWeights.data() : nullptr, cWeights.size() > 0 ? cWeights.data() : nullptr, useCrossValidation); ->>>>>>> fine_gray */ } @@ -1768,7 +1765,7 @@ void CyclicCoordinateDescent::turnOnStreamCV(int foldToCompute) { streamCV = true; streamCVFolds = foldToCompute; - std::cout << "foldToCompute: " << foldToCompute << "\n"; +// std::cout << "foldToCompute: " << foldToCompute << "\n"; modelSpecifics.turnOnStreamCV(foldToCompute); } @@ -1777,7 +1774,7 @@ void CyclicCoordinateDescent::turnOnSyncCV(int foldToCompute) { syncCV = true; syncCVFolds = foldToCompute; - std::cout << "foldToCompute: " << foldToCompute << "\n"; +// std::cout << "foldToCompute: " << foldToCompute << "\n"; modelSpecifics.turnOnSyncCV(foldToCompute); for(int i=0; i CyclicCoordinateDescent::getPredictiveLogLikelihood(std::vec } } - std::cout << "iterations: " << lastIterationCount << " "; +// std::cout << "iterations: " << lastIterationCount << " "; return result; } diff --git a/src/cyclops/engine/CudaKernel.cu b/src/cyclops/engine/CudaKernel.cu index feec81d7..70108ab0 100644 --- a/src/cyclops/engine/CudaKernel.cu +++ b/src/cyclops/engine/CudaKernel.cu @@ -288,6 +288,7 @@ CudaKernel::CudaKernel(const std::string& deviceName) stream = (cudaStream_t *) malloc(sizeof(cudaStream_t)); cudaStreamCreate(&stream[0]); +#ifdef DEBUG_GPU_COX if (deviceStatus == cudaSuccess) { std::cout << "ctor CudaKernel on " << deviceName << " stream: " << stream << '\n'; } else if (deviceStatus == cudaErrorDeviceAlreadyInUse) { @@ -295,7 +296,7 @@ CudaKernel::CudaKernel(const std::string& deviceName) } else if (deviceStatus == cudaErrorInvalidDevice) { std::cout << "cudaErrorInvalidDevice \n"; } - +#endif } template @@ -315,7 +316,10 @@ CudaKernel::~CudaKernel() // cudaFree(boundOut); // cudaFree(temp); // cudaDeviceReset(); + +#ifdef DEBUG_GPU_COX std::cout << "dtor CudaKernel \n"; +#endif } /* template @@ -377,7 +381,9 @@ cudaStream_t* CudaKernel::getStream() { template const std::string CudaKernel::getDeviceName() { +#ifdef DEBUG_GPU_COX std::cout << "getDeviceName: " << desiredDeviceName << '\n'; +#endif return desiredDeviceName; } @@ -396,7 +402,9 @@ void CudaKernel::setFold(int currentFold) if (curIndex != devIndex) { // TODO: why and where is it set back to the default device? bool deviceFlag = cudaSetDevice(devIndex); +#ifdef DEBUG_GPU_COX std::cout << "SET DEVICE TO " << desiredDeviceName << " AGAIN at fold " << fold << '\n'; +#endif } } diff --git a/src/cyclops/engine/CudaKernel.h b/src/cyclops/engine/CudaKernel.h index 891d918a..bebb5b42 100644 --- a/src/cyclops/engine/CudaKernel.h +++ b/src/cyclops/engine/CudaKernel.h @@ -2,6 +2,8 @@ #include #include "../CompressedDataMatrix.h" +// #define DEBUG_GPU_COX + typedef typename bsccs::FormatType FormatType; enum FormatTypeCuda { diff --git a/src/cyclops/engine/GpuModelSpecificsCox.hpp b/src/cyclops/engine/GpuModelSpecificsCox.hpp index 112ba5c3..3c8239bf 100644 --- a/src/cyclops/engine/GpuModelSpecificsCox.hpp +++ b/src/cyclops/engine/GpuModelSpecificsCox.hpp @@ -37,11 +37,15 @@ class CudaAllGpuColumns { CudaAllGpuColumns() { // Do nothing +#ifdef DEBUG_GPU_COX std::cerr << "ctor CudaAllGpuColumns" << std::endl; +#endif } virtual ~CudaAllGpuColumns() { +#ifdef DEBUG_GPU_COX std::cerr << "dtor CudaAllGpuColumns" << std::endl; +#endif } void initialize(const CompressedDataMatrix& mat, @@ -49,8 +53,9 @@ class CudaAllGpuColumns { // std::vector flatData; // std::vector flatIndices; - +#ifdef DEBUG_GPU_COX std::cerr << "Cuda AGC start" << std::endl; +#endif UInt dataStart = 0; UInt indicesStart = 0; @@ -83,13 +88,15 @@ class CudaAllGpuColumns { taskCounts.push_back(column.getNumberOfEntries()); } } - +#ifdef DEBUG_GPU_COX std::cerr << "cuda AGC end " << flatData.size() << " " << flatIndices.size() << " " << dataStarts.size() << " " << indicesStarts.size() << " " << taskCounts.size() << std::endl; +#endif } void resizeAndCopyColumns (cudaStream_t* stream) { +#ifdef DEBUG_GPU_COX std::cout << "resizeAndCopyColumns \n"; - +#endif resizeAndCopyToDeviceCuda(flatData, data, stream); resizeAndCopyToDeviceCuda(flatIndices, indices, stream); resizeAndCopyToDeviceCuda(dataStarts, ddataStarts, stream); @@ -238,13 +245,17 @@ class GpuModelSpecificsCox : dAccNumer(), dAccNumer2(), dDecDenom(), dDecNumer(), dDecNumer2(), dKWeight(), dNWeight(), dYWeight(), CoxKernels(deviceName), dCudaColumns(){ +#ifdef DEBUG_GPU_COX std::cerr << "ctor GpuModelSpecificsCox" << std::endl; +#endif } virtual ~GpuModelSpecificsCox() { cudaFree(dGH); // cudaFreeHost(pGH); +#ifdef DEBUG_GPU_COX std::cerr << "dtor GpuModelSpecificsCox" << std::endl; +#endif } virtual AbstractModelSpecifics* clone(ComputeDeviceArguments computeDevice) const { @@ -269,9 +280,9 @@ virtual void setPidForAccumulation(const double* weights) { } } accReset.push_back(K); - +#ifdef DEBUG_GPU_COX std::cerr << "Num of strata: " << accReset.size() << std::endl; - +#endif // copy stratumId from host to device CoxKernels.resizeAndCopyToDeviceInt(hPidInternal, dPid); } @@ -284,7 +295,7 @@ virtual void deviceInitialization() { std::cerr << "start dI" << std::endl; #endif -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif // Initialize columns @@ -377,9 +388,11 @@ virtual void deviceInitialization() { dGH, N); } - +#ifdef DEBUG_GPU_COX std::cout << "K: " << K << " N: " << N << '\n'; -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#endif + +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z cudaDevInit "] += bsccs::chrono::duration_cast(end - start).count(); @@ -427,13 +440,13 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali } } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif // Device CoxKernels.resizeAndCopyToDevice(hKWeight, dKWeight); CoxKernels.resizeAndCopyToDevice(hNWeight, dNWeight); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count(); @@ -450,12 +463,12 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali hYWeight[k] = cenWeights[k]; hYWeightDouble[k] = cenWeights[k]; } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif // Device CoxKernels.resizeAndCopyToDevice(hYWeight, dYWeight); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count(); @@ -467,12 +480,12 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali virtual void computeFixedTermsInGradientAndHessian(bool useCrossValidation) { ModelSpecifics::computeFixedTermsInGradientAndHessian(useCrossValidation); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif // resizeAndCopyToDeviceCuda(hXjY, dXjY); CoxKernels.resizeAndCopyToDevice(hXjY, dXjY); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count(); @@ -527,7 +540,7 @@ virtual void computeRemainingStatistics(bool useWeights) { } } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif // Device @@ -542,7 +555,7 @@ virtual void computeRemainingStatistics(bool useWeights) { CoxKernels.copyFromHostToDevice(denomPid, dDenominator); // CoxKernels.copyFromHostToDevice(accDenomPid, dAccDenom); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count();; @@ -565,11 +578,11 @@ virtual double getLogLikelihood(bool useCrossValidation) { } else { CoxKernels.computeAccumlatedDenominator(dDenominator, dAccDenom, K); } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start0 = bsccs::chrono::steady_clock::now(); #endif CoxKernels.copyFromDeviceToHost(dAccDenom, accDenomPid); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end0 = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end0 - start0).count();; @@ -812,21 +825,21 @@ virtual void updateBetaAndDelta(int index, bool useWeights) { ///////////////////////////" duration["updateXBetaG "] += bsccs::chrono::duration_cast(end4 - start4).count(); #endif -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX duration["GPU GH "] += bsccs::chrono::duration_cast(end - start + end2 - start2 + end4 - start4).count(); #endif } virtual const std::vector getXBeta() { -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif if (!hXBetaKnown) { CoxKernels.copyFromDeviceToHost(dXBeta, hXBeta); hXBetaKnown = true; } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count();; @@ -839,14 +852,14 @@ virtual const std::vector getXBetaSave() { } virtual void saveXBeta() { -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif if (!hXBetaKnown) { CoxKernels.copyFromDeviceToHost(dXBeta, hXBeta); hXBetaKnown = true; } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count();; @@ -867,7 +880,7 @@ virtual void axpyXBeta(const double beta, const int j) { } virtual std::vector getBeta() { -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = bsccs::chrono::steady_clock::now(); #endif CoxKernels.copyFromDeviceToDevice(dBound, dBoundBuffer); @@ -875,7 +888,7 @@ virtual std::vector getBeta() { CoxKernels.copyFromDeviceToHost(dBeta, RealHBeta); // CoxKernels.getBeta(RealHBeta); // CoxKernels.getBound(); -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = bsccs::chrono::steady_clock::now(); ///////////////////////////" duration["z Data transfer "] += bsccs::chrono::duration_cast(end - start).count();; @@ -921,13 +934,17 @@ void turnOnStreamCV(int foldToCompute) { streamCV = true; streamCVFolds = foldToCompute; CoxKernels.allocStreams(streamCVFolds); +#ifdef DEBUG_GPU_COX std::cout << "GPUMS streamCVFolds: " << streamCVFolds << '\n'; +#endif } void setFold(int inFold){ fold = inFold; CoxKernels.setFold(inFold); -// std::cout << "GPUMS current fold: " << fold << '\n'; +#ifdef DEBUG_GPU_COX + std::cout << "GPUMS current fold: " << fold << '\n'; +#endif } private: diff --git a/src/cyclops/engine/ModelSpecifics.h b/src/cyclops/engine/ModelSpecifics.h index b4ac9505..884291b7 100644 --- a/src/cyclops/engine/ModelSpecifics.h +++ b/src/cyclops/engine/ModelSpecifics.h @@ -16,7 +16,7 @@ #include //#define CYCLOPS_DEBUG_TIMING -//#define CYCLOPS_GPU_COX_DEBUG_TIMING +//#define CYCLOPS_DEBUG_TIMING_GPU_COX //#define CYCLOPS_DEBUG_TIMING_LOW #ifdef CYCLOPS_DEBUG_TIMING diff --git a/src/cyclops/engine/ModelSpecifics.hpp b/src/cyclops/engine/ModelSpecifics.hpp index 63413e4f..8491689c 100644 --- a/src/cyclops/engine/ModelSpecifics.hpp +++ b/src/cyclops/engine/ModelSpecifics.hpp @@ -789,7 +789,7 @@ void ModelSpecifics::computeGradientAndHessian(int index, do #endif #endif -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX duration["CPU GH "] += bsccs::chrono::duration_cast(end - start).count(); #endif } @@ -1619,7 +1619,7 @@ void ModelSpecifics::computeNumeratorForGradient(int index, #endif #endif -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX duration["CPU GH "] += bsccs::chrono::duration_cast(end - start).count(); #endif } @@ -1732,7 +1732,7 @@ void ModelSpecifics::updateXBeta(double delta, int index, bo #endif #endif -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX duration["CPU GH "] += bsccs::chrono::duration_cast(end - start).count(); #endif @@ -2157,7 +2157,7 @@ void ModelSpecifics::initialize( if (allocateXjX()) { hXjX.resize(J); } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto start = std::chrono::steady_clock::now(); #endif if (initializeAccumulationVectors()) { @@ -2167,7 +2167,7 @@ void ModelSpecifics::initialize( // If true, then fill with pointers to CompressedDataColumn and do not delete in destructor setupSparseIndices(N); // Need to be recomputed when hPid change! } -#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING +#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX auto end = std::chrono::steady_clock::now(); double timerPid = std::chrono::duration(end - start).count(); std::cout << " OVERHEAD CCD setPid: " << timerPid << " s \n";