Skip to content

Commit

Permalink
use unified external error message for cufft api (#36114)
Browse files Browse the repository at this point in the history
  • Loading branch information
cxxly authored Oct 11, 2021
1 parent 2bf82e7 commit 642aaa2
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 15 deletions.
4 changes: 2 additions & 2 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ if(WITH_GPU)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${URL} "externalError" MD5 061f3b7895aadcbe2c3ed592590f8b10) # download file externalErrorMsg.tar.gz
set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz
if(WITH_TESTING)
# Copy externalErrorMsg.pb so that unit tests can look up error messages correctly.
set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
Expand Down
5 changes: 2 additions & 3 deletions paddle/fluid/operators/spectral_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,7 @@ static inline std::string get_cufft_error_info(cufftResult error) {
}

// Validates the status code returned by a cuFFT call.
//
// `error` is the cufftResult to check. Nothing is returned; failures are
// reported through the Paddle enforce macros used below (presumably by
// throwing -- confirm against the macro definitions in enforce.h).
static inline void CUFFT_CHECK(cufftResult error) {
// Explicit check: any status other than CUFFT_SUCCESS is reported as an
// External error whose text comes from get_cufft_error_info().
if (error != CUFFT_SUCCESS) {
PADDLE_THROW(platform::errors::External(get_cufft_error_info(error)));
}
// The status is also handed to the shared PADDLE_ENFORCE_CUDA_SUCCESS macro.
// NOTE(review): this looks redundant with the explicit check above -- confirm
// whether both are intended to be present.
PADDLE_ENFORCE_CUDA_SUCCESS(error);
}

// This struct is used to easily compute hashes of the
Expand Down Expand Up @@ -413,6 +411,7 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
? framework::ToRealType(input.type())
: input.type();
auto fft_type = GetFFTTransformType(input.type(), output.type());

PlanKey Key(framework::vectorize(input.dims()),
framework::vectorize(output.dims()), signal_size, fft_type,
value_type);
Expand Down
14 changes: 14 additions & 0 deletions paddle/fluid/platform/enforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
#include <cufft.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
Expand Down Expand Up @@ -714,6 +715,7 @@ DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND);
DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN);
DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS);
DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER);
DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT);

#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL);
Expand Down Expand Up @@ -751,6 +753,8 @@ inline const char* GetErrorMsgUrl(T status) {
return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/"
"types.html#ncclresult-t";
break;
case platform::proto::ApiType::CUFFT:
return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult";
default:
return "Unknown type of External API, can't get error message URL!";
break;
Expand Down Expand Up @@ -839,6 +843,7 @@ template std::string GetExternalErrorMsg<curandStatus_t>(curandStatus_t);
template std::string GetExternalErrorMsg<cudnnStatus_t>(cudnnStatus_t);
template std::string GetExternalErrorMsg<cublasStatus_t>(cublasStatus_t);
template std::string GetExternalErrorMsg<cusolverStatus_t>(cusolverStatus_t);
template std::string GetExternalErrorMsg<cufftResult_t>(cufftResult_t);
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
template std::string GetExternalErrorMsg<ncclResult_t>(ncclResult_t);
#endif
Expand Down Expand Up @@ -899,6 +904,15 @@ inline std::string build_nvidia_error_msg(cusolverStatus_t stat) {
return sout.str();
}

/*************** CUFFT ERROR ***************/
// Returns true when the given cuFFT status is anything other than
// CUFFT_SUCCESS.
inline bool is_error(cufftResult_t stat) {
  const bool succeeded = (stat == CUFFT_SUCCESS);
  return !succeeded;
}

// Builds the text reported for a failing cuFFT status: the
// "CUFFT error(<code>). " prefix followed by whatever
// GetExternalErrorMsg() returns for that status.
inline std::string build_nvidia_error_msg(cufftResult_t stat) {
  std::ostringstream message;
  message << "CUFFT error(" << stat << "). ";
  message << GetExternalErrorMsg(stat);
  return message.str();
}

/**************** NCCL ERROR ****************/
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
inline bool is_error(ncclResult_t nccl_result) {
Expand Down
22 changes: 21 additions & 1 deletion paddle/fluid/platform/enforce_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/enforce.h"

#include <list>

#include "gtest/gtest.h"
#include "paddle/fluid/platform/enforce.h"

TEST(ENFORCE, OK) {
PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable(
Expand Down Expand Up @@ -418,6 +419,25 @@ TEST(enforce, cuda_success) {
"negative vector size, for example).To correct: ensure that all the "
"parameters being passed have valid values"));

EXPECT_TRUE(CheckCudaStatusSuccess(CUFFT_SUCCESS));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_PLAN, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_ALLOC_FAILED, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_TYPE, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_VALUE, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INTERNAL_ERROR, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_EXEC_FAILED, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_SETUP_FAILED, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_SIZE, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_UNALIGNED_DATA, "CUFFT error"));
EXPECT_TRUE(
CheckCudaStatusFailure(CUFFT_INCOMPLETE_PARAMETER_LIST, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_DEVICE, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_PARSE_ERROR, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NO_WORKSPACE, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_IMPLEMENTED, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_LICENSE_ERROR, "CUFFT error"));
EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error"));

#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/platform/external_error.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ enum ApiType {
CUBLAS = 3;
CUSOLVER = 4;
NCCL = 5;
CUFFT = 6;
}

message MessageDesc {
Expand Down
30 changes: 23 additions & 7 deletions tools/externalError/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
Usage:
#### **Instructions for crawling new error messages:**

Please run:
```
bash start.sh
```

If you want to update all external error messages, run the command `bash start.sh` in the current directory,
and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz

1. Add new spider code in spider.py to crawl error messages from the website.

2. Run `bash start.sh` in the current directory to generate a new externalErrorMsg_${date}.tar.gz file, for example `externalErrorMsg_20210928.tar.gz`.

3. Upload the above tar file to the **paddlepaddledeps** bucket on BOS (https://paddlepaddledeps.bj.bcebos.com), and copy its download link `${download_url}`. **Be careful not to delete the original tar file.**

4. Compute the MD5 value of the above tar file (`${md5}`), and update the cmake/third_party.cmake file:

```
set(URL "${download_url}" CACHE STRING "" FORCE)
file_download_and_uncompress(${URL} "externalError" MD5 ${md5})
```

for example:

```
set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE)
file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa)
```

5. Commit your changes and create a pull request.
29 changes: 28 additions & 1 deletion tools/externalError/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
import urllib.request
import json
import collections
import sys, getopt
import sys
import getopt
import external_error_pb2
from html.parser import HTMLParser


def parsing(externalErrorDesc):
Expand Down Expand Up @@ -335,6 +337,31 @@ def parsing(externalErrorDesc):
_Messages.message = "'%s'. %s" % (error[0], m_message)
print("End crawling errorMessage for nvidia NCCL API!\n")

#*************************************************************************************************#
#*********************************** CUFFT Error Message **************************************#
print("start crawling errorMessage for nvidia CUFFT API--->")
url = 'https://docs.nvidia.com/cuda/cufft/index.html#cufftresult'

allMessageDesc = externalErrorDesc.errors.add()
allMessageDesc.type = external_error_pb2.CUFFT

html = urllib.request.urlopen(url).read().decode('utf-8')

class CUFFTHTMLParser(HTMLParser):
    '''Extracts cufftResult_t enum entries from the cuFFT documentation page.

    Every text node handed to ``handle_data`` is inspected; the one holding
    the ``typedef enum cufftResult_t`` listing is parsed line by line and
    each entry is appended to ``allMessageDesc.messages`` (taken from the
    enclosing scope).
    '''

    def handle_data(self, data):
        '''Parse the enum listing, if this text node contains it.

        Each entry is expected to look like ``NAME = CODE, // description``.
        The first and last lines of the node (the ``typedef`` opener and the
        closing line) are skipped, as are blank lines.

        Raises:
            ValueError: if an entry has no parsable integer code.
        '''
        if 'typedef enum cufftResult_t' not in data:
            return
        for line in data.strip().splitlines()[1:-1]:
            entry = line.strip()
            if not entry:
                continue
            # Split only on the FIRST '=' and the FIRST '//' so that a
            # description which itself contains '=' or '//' (for example a
            # URL) stays intact instead of breaking a fixed 3-way unpack.
            status, _, remainder = entry.partition('=')
            code, _, desc = remainder.partition('//')
            _Messages = allMessageDesc.messages.add()
            # Base 0 accepts both decimal ("16") and C-style hex ("0x10").
            _Messages.code = int(code.strip(' ,'), 0)
            _Messages.message = "'%s'. %s" % (status.strip(), desc.strip())

CUFFTHTMLParser().feed(html)


def main(argv):
try:
Expand Down
2 changes: 1 addition & 1 deletion tools/externalError/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ fi
protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto

python3.7 spider.py
tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb
tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb

0 comments on commit 642aaa2

Please sign in to comment.