Skip to content

Commit

Permalink
clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
jianxiaoyang committed Oct 25, 2023
1 parent 4e15b30 commit 4e90223
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 40 deletions.
11 changes: 4 additions & 7 deletions src/cyclops/CyclicCoordinateDescent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ int CyclicCoordinateDescent::getAlignedLength(int N) {
}

void CyclicCoordinateDescent::computeNEvents() {
//<<<<<<< HEAD
if (syncCV) {
for (int i=0; i<syncCVFolds; i++) {
modelSpecifics.setWeights(
Expand All @@ -268,8 +267,7 @@ void CyclicCoordinateDescent::computeNEvents() {
cWeights.size() > 0 ? cWeights.data() : nullptr,
useCrossValidation);
}
/*
=======
/*
//modelSpecifics.setWeights(
// hWeights.size() > 0 ? hWeights.data() : nullptr,
// useCrossValidation);
Expand All @@ -278,7 +276,6 @@ void CyclicCoordinateDescent::computeNEvents() {
hWeights.size() > 0 ? hWeights.data() : nullptr,
cWeights.size() > 0 ? cWeights.data() : nullptr,
useCrossValidation);
>>>>>>> fine_gray
*/
}

Expand Down Expand Up @@ -1768,7 +1765,7 @@ void CyclicCoordinateDescent::turnOnStreamCV(int foldToCompute) {

streamCV = true;
streamCVFolds = foldToCompute;
std::cout << "foldToCompute: " << foldToCompute << "\n";
// std::cout << "foldToCompute: " << foldToCompute << "\n";
modelSpecifics.turnOnStreamCV(foldToCompute);

}
Expand All @@ -1777,7 +1774,7 @@ void CyclicCoordinateDescent::turnOnSyncCV(int foldToCompute) {

syncCV = true;
syncCVFolds = foldToCompute;
std::cout << "foldToCompute: " << foldToCompute << "\n";
// std::cout << "foldToCompute: " << foldToCompute << "\n";
modelSpecifics.turnOnSyncCV(foldToCompute);
for(int i=0; i<foldToCompute; ++i) {
hBetaPool.push_back(hBeta);
Expand Down Expand Up @@ -1845,7 +1842,7 @@ std::vector<double> CyclicCoordinateDescent::getPredictiveLogLikelihood(std::vec
}
}

std::cout << "iterations: " << lastIterationCount << " ";
// std::cout << "iterations: " << lastIterationCount << " ";
return result;
}

Expand Down
10 changes: 9 additions & 1 deletion src/cyclops/engine/CudaKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -288,14 +288,15 @@ CudaKernel<RealType, RealType2>::CudaKernel(const std::string& deviceName)
stream = (cudaStream_t *) malloc(sizeof(cudaStream_t));
cudaStreamCreate(&stream[0]);

#ifdef DEBUG_GPU_COX
if (deviceStatus == cudaSuccess) {
std::cout << "ctor CudaKernel on " << deviceName << " stream: " << stream << '\n';
} else if (deviceStatus == cudaErrorDeviceAlreadyInUse) {
std::cout << "cudaErrorDeviceAlreadyInUse \n";
} else if (deviceStatus == cudaErrorInvalidDevice) {
std::cout << "cudaErrorInvalidDevice \n";
}

#endif
}

template <typename RealType, typename RealType2>
Expand All @@ -315,7 +316,10 @@ CudaKernel<RealType, RealType2>::~CudaKernel()
// cudaFree(boundOut);
// cudaFree(temp);
// cudaDeviceReset();

#ifdef DEBUG_GPU_COX
std::cout << "dtor CudaKernel \n";
#endif
}
/*
template <typename RealType, typename RealType2>
Expand Down Expand Up @@ -377,7 +381,9 @@ cudaStream_t* CudaKernel<RealType, RealType2>::getStream() {

// Returns desiredDeviceName by value.
// When DEBUG_GPU_COX is defined, the name is also written to std::cout
// before returning; otherwise the function has no output side effect.
template <typename RealType, typename RealType2>
const std::string CudaKernel<RealType, RealType2>::getDeviceName() {
#ifdef DEBUG_GPU_COX
std::cout << "getDeviceName: " << desiredDeviceName << '\n';
#endif
return desiredDeviceName;
}

Expand All @@ -396,7 +402,9 @@ void CudaKernel<RealType, RealType2>::setFold(int currentFold)
if (curIndex != devIndex) {
// TODO: why and where is it set back to the default device?
bool deviceFlag = cudaSetDevice(devIndex);
#ifdef DEBUG_GPU_COX
std::cout << "SET DEVICE TO " << desiredDeviceName << " AGAIN at fold " << fold << '\n';
#endif
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/cyclops/engine/CudaKernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <vector>
#include "../CompressedDataMatrix.h"

// #define DEBUG_GPU_COX

typedef typename bsccs::FormatType FormatType;

enum FormatTypeCuda {
Expand Down
69 changes: 43 additions & 26 deletions src/cyclops/engine/GpuModelSpecificsCox.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,25 @@ class CudaAllGpuColumns {

// Default constructor. The body performs no work of its own; the only
// statement is an optional trace message.
// NOTE(review): member initialization is not visible here — presumably the
// members are default-constructed; confirm against the class declaration.
CudaAllGpuColumns() {
// Do nothing
#ifdef DEBUG_GPU_COX
// Emitted only when DEBUG_GPU_COX is defined: trace construction on std::cerr.
std::cerr << "ctor CudaAllGpuColumns" << std::endl;
#endif
}

// Virtual destructor. The body releases nothing explicitly; it only writes
// a trace message to std::cerr when DEBUG_GPU_COX is defined.
virtual ~CudaAllGpuColumns() {
#ifdef DEBUG_GPU_COX
std::cerr << "dtor CudaAllGpuColumns" << std::endl;
#endif
}

void initialize(const CompressedDataMatrix<RealType>& mat,
size_t K, bool pad) {

// std::vector<RealType> flatData;
// std::vector<int> flatIndices;

#ifdef DEBUG_GPU_COX
std::cerr << "Cuda AGC start" << std::endl;
#endif

UInt dataStart = 0;
UInt indicesStart = 0;
Expand Down Expand Up @@ -83,13 +88,15 @@ class CudaAllGpuColumns {
taskCounts.push_back(column.getNumberOfEntries());
}
}

#ifdef DEBUG_GPU_COX
std::cerr << "cuda AGC end " << flatData.size() << " " << flatIndices.size() << " " << dataStarts.size() << " " << indicesStarts.size() << " " << taskCounts.size() << std::endl;
#endif
}

void resizeAndCopyColumns (cudaStream_t* stream) {
#ifdef DEBUG_GPU_COX
std::cout << "resizeAndCopyColumns \n";

#endif
resizeAndCopyToDeviceCuda(flatData, data, stream);
resizeAndCopyToDeviceCuda(flatIndices, indices, stream);
resizeAndCopyToDeviceCuda(dataStarts, ddataStarts, stream);
Expand Down Expand Up @@ -238,13 +245,17 @@ class GpuModelSpecificsCox :
dAccNumer(), dAccNumer2(), dDecDenom(), dDecNumer(), dDecNumer2(),
dKWeight(), dNWeight(), dYWeight(),
CoxKernels(deviceName), dCudaColumns(){
#ifdef DEBUG_GPU_COX
std::cerr << "ctor GpuModelSpecificsCox" << std::endl;
#endif
}

// Virtual destructor. Passes dGH to cudaFree; the returned status is not
// inspected. The matching cudaFreeHost(pGH) call is left commented out.
// NOTE(review): where dGH is allocated is not visible in this view — confirm
// it is always either a valid device pointer or null when this runs.
virtual ~GpuModelSpecificsCox() {
cudaFree(dGH);
// cudaFreeHost(pGH);
#ifdef DEBUG_GPU_COX
// Emitted only when DEBUG_GPU_COX is defined: trace destruction on std::cerr.
std::cerr << "dtor GpuModelSpecificsCox" << std::endl;
#endif
}

virtual AbstractModelSpecifics* clone(ComputeDeviceArguments computeDevice) const {
Expand All @@ -269,9 +280,9 @@ virtual void setPidForAccumulation(const double* weights) {
}
}
accReset.push_back(K);

#ifdef DEBUG_GPU_COX
std::cerr << "Num of strata: " << accReset.size() << std::endl;

#endif
// copy stratumId from host to device
CoxKernels.resizeAndCopyToDeviceInt(hPidInternal, dPid);
}
Expand All @@ -284,7 +295,7 @@ virtual void deviceInitialization() {
std::cerr << "start dI" << std::endl;
#endif

#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
// Initialize columns
Expand Down Expand Up @@ -377,9 +388,11 @@ virtual void deviceInitialization() {
dGH,
N);
}

#ifdef DEBUG_GPU_COX
std::cout << "K: " << K << " N: " << N << '\n';
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#endif

#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z cudaDevInit "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();
Expand Down Expand Up @@ -427,13 +440,13 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali
}
}

#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
// Device
CoxKernels.resizeAndCopyToDevice(hKWeight, dKWeight);
CoxKernels.resizeAndCopyToDevice(hNWeight, dNWeight);
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();
Expand All @@ -450,12 +463,12 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali
hYWeight[k] = cenWeights[k];
hYWeightDouble[k] = cenWeights[k];
}
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
// Device
CoxKernels.resizeAndCopyToDevice(hYWeight, dYWeight);
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();
Expand All @@ -467,12 +480,12 @@ virtual void setWeights(double* inWeights, double *cenWeights, bool useCrossVali
virtual void computeFixedTermsInGradientAndHessian(bool useCrossValidation) {

ModelSpecifics<BaseModel,RealType>::computeFixedTermsInGradientAndHessian(useCrossValidation);
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
// resizeAndCopyToDeviceCuda(hXjY, dXjY);
CoxKernels.resizeAndCopyToDevice(hXjY, dXjY);
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();
Expand Down Expand Up @@ -527,7 +540,7 @@ virtual void computeRemainingStatistics(bool useWeights) {
}
}

#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
// Device
Expand All @@ -542,7 +555,7 @@ virtual void computeRemainingStatistics(bool useWeights) {
CoxKernels.copyFromHostToDevice(denomPid, dDenominator);
// CoxKernels.copyFromHostToDevice(accDenomPid, dAccDenom);

#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();;
Expand All @@ -565,11 +578,11 @@ virtual double getLogLikelihood(bool useCrossValidation) {
} else {
CoxKernels.computeAccumlatedDenominator(dDenominator, dAccDenom, K);
}
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start0 = bsccs::chrono::steady_clock::now();
#endif
CoxKernels.copyFromDeviceToHost(dAccDenom, accDenomPid);
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end0 = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end0 - start0).count();;
Expand Down Expand Up @@ -812,21 +825,21 @@ virtual void updateBetaAndDelta(int index, bool useWeights) {
///////////////////////////"
duration["updateXBetaG "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end4 - start4).count();
#endif
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
duration["GPU GH "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start + end2 - start2 + end4 - start4).count();
#endif
}

virtual const std::vector<double> getXBeta() {

#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
if (!hXBetaKnown) {
CoxKernels.copyFromDeviceToHost(dXBeta, hXBeta);
hXBetaKnown = true;
}
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();;
Expand All @@ -839,14 +852,14 @@ virtual const std::vector<double> getXBetaSave() {
}

virtual void saveXBeta() {
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
if (!hXBetaKnown) {
CoxKernels.copyFromDeviceToHost(dXBeta, hXBeta);
hXBetaKnown = true;
}
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();;
Expand All @@ -867,15 +880,15 @@ virtual void axpyXBeta(const double beta, const int j) {
}

virtual std::vector<double> getBeta() {
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto start = bsccs::chrono::steady_clock::now();
#endif
CoxKernels.copyFromDeviceToDevice(dBound, dBoundBuffer);
CoxKernels.copyFromDeviceToDevice(dBeta, dBetaBuffer);
CoxKernels.copyFromDeviceToHost(dBeta, RealHBeta);
// CoxKernels.getBeta(RealHBeta);
// CoxKernels.getBound();
#ifdef CYCLOPS_GPU_COX_DEBUG_TIMING
#ifdef CYCLOPS_DEBUG_TIMING_GPU_COX
auto end = bsccs::chrono::steady_clock::now();
///////////////////////////"
duration["z Data transfer "] += bsccs::chrono::duration_cast<chrono::TimingUnits>(end - start).count();;
Expand Down Expand Up @@ -921,13 +934,17 @@ void turnOnStreamCV(int foldToCompute) {
streamCV = true;
streamCVFolds = foldToCompute;
CoxKernels.allocStreams(streamCVFolds);
#ifdef DEBUG_GPU_COX
std::cout << "GPUMS streamCVFolds: " << streamCVFolds << '\n';
#endif
}

// Stores inFold in the member `fold` and forwards the same value to
// CoxKernels.setFold. When DEBUG_GPU_COX is defined, the stored fold is
// also printed to std::cout.
void setFold(int inFold){
fold = inFold;
CoxKernels.setFold(inFold);
// std::cout << "GPUMS current fold: " << fold << '\n';
#ifdef DEBUG_GPU_COX
std::cout << "GPUMS current fold: " << fold << '\n';
#endif
}

private:
Expand Down
2 changes: 1 addition & 1 deletion src/cyclops/engine/ModelSpecifics.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include <complex>

//#define CYCLOPS_DEBUG_TIMING
//#define CYCLOPS_GPU_COX_DEBUG_TIMING
//#define CYCLOPS_DEBUG_TIMING_GPU_COX
//#define CYCLOPS_DEBUG_TIMING_LOW

#ifdef CYCLOPS_DEBUG_TIMING
Expand Down
Loading

0 comments on commit 4e90223

Please sign in to comment.