From 96c8f727aa3a0947234e60eebdf4c5d1d72c7445 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Wed, 1 Aug 2018 16:26:58 -0400 Subject: [PATCH 01/15] Commit to make the branch compile --- CMakeLists.txt | 24 +-- Problem Sets/Problem Set 1/Makefile | 256 ++++++++++++++++++++---- Problem Sets/Problem Set 2/Makefile | 256 +++++++++++++++++++++--- Problem Sets/Problem Set 3/Makefile | 288 +++++++++++++++++++++++---- Problem Sets/Problem Set 4/Makefile | 292 +++++++++++++++++++++++---- Problem Sets/Problem Set 5/Makefile | 184 ++++++++++++++++- Problem Sets/Problem Set 5/main.cu | 2 +- Problem Sets/Problem Set 6/Makefile | 293 ++++++++++++++++++++++++---- 8 files changed, 1385 insertions(+), 210 deletions(-) mode change 100755 => 100644 Problem Sets/Problem Set 1/Makefile mode change 100755 => 100644 Problem Sets/Problem Set 2/Makefile mode change 100755 => 100644 Problem Sets/Problem Set 3/Makefile mode change 100755 => 100644 Problem Sets/Problem Set 4/Makefile mode change 100755 => 100644 Problem Sets/Problem Set 5/Makefile mode change 100755 => 100644 Problem Sets/Problem Set 6/Makefile diff --git a/CMakeLists.txt b/CMakeLists.txt index 996cb1c3..2627178a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,27 +24,27 @@ if(CUDA_FOUND) -ccbin /usr/bin/clang; -gencode;arch=compute_30,code=sm_30; -gencode;arch=compute_35,code=sm_35; - -gencode;arch=compute_35,code=compute_35; - -gencode;arch=compute_20,code=sm_20; - -gencode;arch=compute_11,code=sm_11; - -gencode;arch=compute_12,code=sm_12; - -gencode;arch=compute_13,code=sm_13;") + -gencode;arch=compute_35,code=compute_35;") + #-gencode;arch=compute_20,code=sm_20; + #-gencode;arch=compute_11,code=sm_11; + #-gencode;arch=compute_12,code=sm_12; + #-gencode;arch=compute_13,code=sm_13;") # add -Wextra compiler flag for gcc compilations if (UNIX) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -Wextra") - set(CMAKE_CXX_FLAGS "-stdlib=libstdc++") + #set(CMAKE_CXX_FLAGS "-stdlib=libstdc++") endif (UNIX) # add debugging 
to CUDA NVCC flags. For NVidia's NSight tools. set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G") - add_subdirectory (HW1) - add_subdirectory (HW2) - add_subdirectory (HW3) - add_subdirectory (HW4) - add_subdirectory (HW5) - add_subdirectory (HW6) + add_subdirectory ("Problem Sets/Problem Set 1") + add_subdirectory ("Problem Sets/Problem Set 2") + add_subdirectory ("Problem Sets/Problem Set 3") + add_subdirectory ("Problem Sets/Problem Set 4") + add_subdirectory ("Problem Sets/Problem Set 5") + add_subdirectory ("Problem Sets/Problem Set 6") else(CUDA_FOUND) message("CUDA is not installed on this system.") endif() diff --git a/Problem Sets/Problem Set 1/Makefile b/Problem Sets/Problem Set 1/Makefile old mode 100755 new mode 100644 index bc0b2e5d..525ffba0 --- a/Problem Sets/Problem Set 1/Makefile +++ b/Problem Sets/Problem Set 1/Makefile @@ -1,54 +1,240 @@ -NVCC=nvcc +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -################################### -# These are the default install # -# locations on most linux distros # -################################### +# Default target executed when no arguments are given to make. +default_target: all -OPENCV_LIBPATH=/usr/lib -OPENCV_INCLUDEPATH=/usr/include +.PHONY : default_target -################################################### -# On Macs the default install locations are below # -################################################### +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: -#OPENCV_LIBPATH=/usr/local/lib -#OPENCV_INCLUDEPATH=/usr/local/include -# or if using MacPorts +#============================================================================= +# Special targets provided by cmake. -#OPENCV_LIBPATH=/opt/local/lib -#OPENCV_INCLUDEPATH=/opt/local/include +# Disable implicit rules so canonical targets will work. 
+.SUFFIXES: -OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = -###################################################### -# On Macs the default install locations are below # -# #################################################### +.SUFFIXES: .hpux_make_needs_suffix_list -#CUDA_INCLUDEPATH=/usr/local/cuda/include -#CUDA_LIBPATH=/usr/local/cuda/lib -NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 +# Suppress display of executed commands. +$(VERBOSE).SILENT: -GCC_OPTS=-O3 -Wall -Wextra -m64 -student: main.o student_func.o compare.o reference_calc.o Makefile - $(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) +# A target that is always out of date. +cmake_force: -main.o: main.cpp timer.h utils.h reference_calc.cpp compare.cpp HW1.cpp - g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH) +.PHONY : cmake_force -student_func.o: student_func.cu utils.h - nvcc -c student_func.cu $(NVCC_OPTS) +#============================================================================= +# Set environment variables for the build. -compare.o: compare.cpp compare.h - g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The shell in which to execute make rules. +SHELL = /bin/sh -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. 
+CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 1/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 1/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o *.png hw + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 1/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 1/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 1/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 1/CMakeFiles/HW1.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 1/CMakeFiles/HW1.dir/rule + +# Convenience name for target. +HW1: Problem\ Sets/Problem\ Set\ 1/CMakeFiles/HW1.dir/rule + +.PHONY : HW1 + +# fast build rule for target. +HW1/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build" +.PHONY : HW1/fast + +compare.o: compare.cpp.o + +.PHONY : compare.o + +# target to build an object file +compare.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/compare.cpp.o" +.PHONY : compare.cpp.o + +compare.i: compare.cpp.i + +.PHONY : compare.i + +# target to preprocess a source file +compare.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/compare.cpp.i" +.PHONY : compare.cpp.i + +compare.s: compare.cpp.s + +.PHONY : compare.s + +# target to generate assembly for a file +compare.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/compare.cpp.s" +.PHONY : compare.cpp.s + +main.o: main.cpp.o + +.PHONY : main.o + +# target to build an object file +main.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem 
Sets/Problem Set 1/CMakeFiles/HW1.dir/main.cpp.o" +.PHONY : main.cpp.o + +main.i: main.cpp.i + +.PHONY : main.i + +# target to preprocess a source file +main.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/main.cpp.i" +.PHONY : main.cpp.i + +main.s: main.cpp.s + +.PHONY : main.s + +# target to generate assembly for a file +main.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/main.cpp.s" +.PHONY : main.cpp.s + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/build.make" "Problem Sets/Problem Set 1/CMakeFiles/HW1.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW1" + @echo "... rebuild_cache" + @echo "... compare.o" + @echo "... compare.i" + @echo "... compare.s" + @echo "... main.o" + @echo "... 
main.i" + @echo "... main.s" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/Problem Sets/Problem Set 2/Makefile b/Problem Sets/Problem Set 2/Makefile old mode 100755 new mode 100644 index ac6c04ed..e6233dc1 --- a/Problem Sets/Problem Set 2/Makefile +++ b/Problem Sets/Problem Set 2/Makefile @@ -1,48 +1,240 @@ -NVCC=nvcc +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -################################### -# These are the default install # -# locations on most linux distros # -################################### +# Default target executed when no arguments are given to make. +default_target: all -OPENCV_LIBPATH=/usr/lib -OPENCV_INCLUDEPATH=/usr/include +.PHONY : default_target -################################################### -# On Macs the default install locations are below # -################################################### +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: -#OPENCV_LIBPATH=/usr/local/lib -#OPENCV_INCLUDEPATH=/usr/local/include -OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include +#============================================================================= +# Special targets provided by cmake. 
-###################################################### -# On Macs the default install locations are below # -# #################################################### +# Disable implicit rules so canonical targets will work. +.SUFFIXES: -#CUDA_INCLUDEPATH=/usr/local/cuda/include -#CUDA_LIBPATH=/usr/local/cuda/lib -NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = -GCC_OPTS=-O3 -Wall -Wextra -m64 +.SUFFIXES: .hpux_make_needs_suffix_list -student: main.o student_func.o compare.o reference_calc.o Makefile - $(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) -main.o: main.cpp timer.h utils.h HW2.cpp - g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH) +# Suppress display of executed commands. +$(VERBOSE).SILENT: -student_func.o: student_func.cu reference_calc.cpp utils.h - nvcc -c student_func.cu $(NVCC_OPTS) -compare.o: compare.cpp compare.h - g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# A target that is always out of date. +cmake_force: -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +.PHONY : cmake_force +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. 
+CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 2/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 2/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o *.png hw + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 2/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 2/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 2/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 2/CMakeFiles/HW2.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 2/CMakeFiles/HW2.dir/rule + +# Convenience name for target. +HW2: Problem\ Sets/Problem\ Set\ 2/CMakeFiles/HW2.dir/rule + +.PHONY : HW2 + +# fast build rule for target. +HW2/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build" +.PHONY : HW2/fast + +compare.o: compare.cpp.o + +.PHONY : compare.o + +# target to build an object file +compare.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/compare.cpp.o" +.PHONY : compare.cpp.o + +compare.i: compare.cpp.i + +.PHONY : compare.i + +# target to preprocess a source file +compare.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/compare.cpp.i" +.PHONY : compare.cpp.i + +compare.s: compare.cpp.s + +.PHONY : compare.s + +# target to generate assembly for a file +compare.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/compare.cpp.s" +.PHONY : compare.cpp.s + +main.o: main.cpp.o + +.PHONY : main.o + +# target to build an object file +main.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem 
Sets/Problem Set 2/CMakeFiles/HW2.dir/main.cpp.o" +.PHONY : main.cpp.o + +main.i: main.cpp.i + +.PHONY : main.i + +# target to preprocess a source file +main.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/main.cpp.i" +.PHONY : main.cpp.i + +main.s: main.cpp.s + +.PHONY : main.s + +# target to generate assembly for a file +main.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/main.cpp.s" +.PHONY : main.cpp.s + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/build.make" "Problem Sets/Problem Set 2/CMakeFiles/HW2.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW2" + @echo "... rebuild_cache" + @echo "... compare.o" + @echo "... compare.i" + @echo "... compare.s" + @echo "... main.o" + @echo "... 
main.i" + @echo "... main.s" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/Problem Sets/Problem Set 3/Makefile b/Problem Sets/Problem Set 3/Makefile old mode 100755 new mode 100644 index 65e7a938..b89747bc --- a/Problem Sets/Problem Set 3/Makefile +++ b/Problem Sets/Problem Set 3/Makefile @@ -1,56 +1,270 @@ -NVCC=nvcc +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -################################### -# These are the default install # -# locations on most linux distros # -################################### +# Default target executed when no arguments are given to make. +default_target: all -OPENCV_LIBPATH=/usr/lib -OPENCV_INCLUDEPATH=/usr/include +.PHONY : default_target -################################################### -# On Macs the default install locations are below # -################################################### +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: -#OPENCV_LIBPATH=/usr/local/lib -#OPENCV_INCLUDEPATH=/usr/local/include -OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui +#============================================================================= +# Special targets provided by cmake. -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include +# Disable implicit rules so canonical targets will work. 
+.SUFFIXES: -###################################################### -# On Macs the default install locations are below # -# #################################################### -#CUDA_INCLUDEPATH=/usr/local/cuda/include -#CUDA_LIBPATH=/usr/local/cuda/lib +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = -NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 +.SUFFIXES: .hpux_make_needs_suffix_list -GCC_OPTS=-O3 -Wall -Wextra -m64 -student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile - $(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) +# Suppress display of executed commands. +$(VERBOSE).SILENT: -main.o: main.cpp timer.h utils.h reference_calc.h compare.h - g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -HW3.o: HW3.cu loadSaveImage.h utils.h - $(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) +# A target that is always out of date. +cmake_force: -loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h - g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +.PHONY : cmake_force -compare.o: compare.cpp compare.h - g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +#============================================================================= +# Set environment variables for the build. -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The shell in which to execute make rules. +SHELL = /bin/sh -student_func.o: student_func.cu utils.h - $(NVCC) -c student_func.cu $(NVCC_OPTS) +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. 
+CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 3/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 3/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o hw - find . -type f -name '*.exr' | grep -v memorial | xargs rm -f + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 3/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 3/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 3/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 3/CMakeFiles/HW3.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 3/CMakeFiles/HW3.dir/rule + +# Convenience name for target. +HW3: Problem\ Sets/Problem\ Set\ 3/CMakeFiles/HW3.dir/rule + +.PHONY : HW3 + +# fast build rule for target. +HW3/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build" +.PHONY : HW3/fast + +compare.o: compare.cpp.o + +.PHONY : compare.o + +# target to build an object file +compare.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/compare.cpp.o" +.PHONY : compare.cpp.o + +compare.i: compare.cpp.i + +.PHONY : compare.i + +# target to preprocess a source file +compare.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/compare.cpp.i" +.PHONY : compare.cpp.i + +compare.s: compare.cpp.s + +.PHONY : compare.s + +# target to generate assembly for a file +compare.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/compare.cpp.s" +.PHONY : compare.cpp.s + +loadSaveImage.o: loadSaveImage.cpp.o + +.PHONY : loadSaveImage.o + +# target to build an object file +loadSaveImage.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 
3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/loadSaveImage.cpp.o" +.PHONY : loadSaveImage.cpp.o + +loadSaveImage.i: loadSaveImage.cpp.i + +.PHONY : loadSaveImage.i + +# target to preprocess a source file +loadSaveImage.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/loadSaveImage.cpp.i" +.PHONY : loadSaveImage.cpp.i + +loadSaveImage.s: loadSaveImage.cpp.s + +.PHONY : loadSaveImage.s + +# target to generate assembly for a file +loadSaveImage.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/loadSaveImage.cpp.s" +.PHONY : loadSaveImage.cpp.s + +main.o: main.cpp.o + +.PHONY : main.o + +# target to build an object file +main.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/main.cpp.o" +.PHONY : main.cpp.o + +main.i: main.cpp.i + +.PHONY : main.i + +# target to preprocess a source file +main.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/main.cpp.i" +.PHONY : main.cpp.i + +main.s: main.cpp.s + +.PHONY : main.s + +# target to generate assembly for a file +main.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/main.cpp.s" +.PHONY : main.cpp.s + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : 
reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/build.make" "Problem Sets/Problem Set 3/CMakeFiles/HW3.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW3" + @echo "... rebuild_cache" + @echo "... compare.o" + @echo "... compare.i" + @echo "... compare.s" + @echo "... loadSaveImage.o" + @echo "... loadSaveImage.i" + @echo "... loadSaveImage.s" + @echo "... main.o" + @echo "... main.i" + @echo "... main.s" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/Problem Sets/Problem Set 4/Makefile b/Problem Sets/Problem Set 4/Makefile old mode 100755 new mode 100644 index 01a3efc0..4edf8592 --- a/Problem Sets/Problem Set 4/Makefile +++ b/Problem Sets/Problem Set 4/Makefile @@ -1,60 +1,270 @@ -NVCC=/usr/local/cuda-5.0/bin/nvcc -#NVCC=nvcc +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -################################### -# These are the default install # -# locations on most linux distros # -################################### +# Default target executed when no arguments are given to make. +default_target: all -OPENCV_LIBPATH=/usr/lib -OPENCV_INCLUDEPATH=/usr/include +.PHONY : default_target -################################################### -# On Macs the default install locations are below # -################################################### +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: -#OPENCV_LIBPATH=/usr/local/lib -#OPENCV_INCLUDEPATH=/usr/local/include -OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui +#============================================================================= +# Special targets provided by cmake. -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include -# CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include -# CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include -# CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include +# Disable implicit rules so canonical targets will work. 
+.SUFFIXES: -###################################################### -# On Macs the default install locations are below # -# #################################################### -#CUDA_INCLUDEPATH=/usr/local/cuda/include -#CUDA_LIBPATH=/usr/local/cuda/lib -CUDA_LIBPATH=/usr/local/cuda-5.0/lib64 +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = -NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 +.SUFFIXES: .hpux_make_needs_suffix_list -GCC_OPTS=-O3 -Wall -Wextra -m64 -student: main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o Makefile - $(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) +# Suppress display of executed commands. +$(VERBOSE).SILENT: -main.o: main.cpp timer.h utils.h reference_calc.h - g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -HW4.o: HW4.cu loadSaveImage.h utils.h - $(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) +# A target that is always out of date. +cmake_force: -loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h - g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +.PHONY : cmake_force -compare.o: compare.cpp compare.h - g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +#============================================================================= +# Set environment variables for the build. -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The shell in which to execute make rules. +SHELL = /bin/sh -student_func.o: student_func.cu reference_calc.cpp utils.h - $(NVCC) -c student_func.cu $(NVCC_OPTS) +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. 
+CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 4/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 4/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o *.png hw + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 4/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 4/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 4/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 4/CMakeFiles/HW4.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 4/CMakeFiles/HW4.dir/rule + +# Convenience name for target. +HW4: Problem\ Sets/Problem\ Set\ 4/CMakeFiles/HW4.dir/rule + +.PHONY : HW4 + +# fast build rule for target. +HW4/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build" +.PHONY : HW4/fast + +compare.o: compare.cpp.o + +.PHONY : compare.o + +# target to build an object file +compare.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/compare.cpp.o" +.PHONY : compare.cpp.o + +compare.i: compare.cpp.i + +.PHONY : compare.i + +# target to preprocess a source file +compare.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/compare.cpp.i" +.PHONY : compare.cpp.i + +compare.s: compare.cpp.s + +.PHONY : compare.s + +# target to generate assembly for a file +compare.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/compare.cpp.s" +.PHONY : compare.cpp.s + +loadSaveImage.o: loadSaveImage.cpp.o + +.PHONY : loadSaveImage.o + +# target to build an object file +loadSaveImage.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 
4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/loadSaveImage.cpp.o" +.PHONY : loadSaveImage.cpp.o + +loadSaveImage.i: loadSaveImage.cpp.i + +.PHONY : loadSaveImage.i + +# target to preprocess a source file +loadSaveImage.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/loadSaveImage.cpp.i" +.PHONY : loadSaveImage.cpp.i + +loadSaveImage.s: loadSaveImage.cpp.s + +.PHONY : loadSaveImage.s + +# target to generate assembly for a file +loadSaveImage.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/loadSaveImage.cpp.s" +.PHONY : loadSaveImage.cpp.s + +main.o: main.cpp.o + +.PHONY : main.o + +# target to build an object file +main.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/main.cpp.o" +.PHONY : main.cpp.o + +main.i: main.cpp.i + +.PHONY : main.i + +# target to preprocess a source file +main.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/main.cpp.i" +.PHONY : main.cpp.i + +main.s: main.cpp.s + +.PHONY : main.s + +# target to generate assembly for a file +main.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/main.cpp.s" +.PHONY : main.cpp.s + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : 
reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/build.make" "Problem Sets/Problem Set 4/CMakeFiles/HW4.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW4" + @echo "... rebuild_cache" + @echo "... compare.o" + @echo "... compare.i" + @echo "... compare.s" + @echo "... loadSaveImage.o" + @echo "... loadSaveImage.i" + @echo "... loadSaveImage.s" + @echo "... main.o" + @echo "... main.i" + @echo "... main.s" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. 
+cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/Problem Sets/Problem Set 5/Makefile b/Problem Sets/Problem Set 5/Makefile old mode 100755 new mode 100644 index d128b74a..ad73b9f8 --- a/Problem Sets/Problem Set 5/Makefile +++ b/Problem Sets/Problem Set 5/Makefile @@ -1,14 +1,180 @@ -NVCC=nvcc -NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -histo: main.cu reference_calc.o student.o Makefile - nvcc -o HW5 main.cu reference_calc.o student.o $(NVCC_OPTS) +# Default target executed when no arguments are given to make. +default_target: all -student.o: student.cu - nvcc -c student.cu $(NVCC_OPTS) +.PHONY : default_target -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + + +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = + +.SUFFIXES: .hpux_make_needs_suffix_list + + +# Suppress display of executed commands. +$(VERBOSE).SILENT: + + +# A target that is always out of date. +cmake_force: + +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f + +# Escaping for special characters. 
+EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." + /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 5/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 5/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o hw *.bin + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 5/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 5/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. 
+preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 5/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 5/CMakeFiles/HW5.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 5/CMakeFiles/HW5.dir/rule + +# Convenience name for target. +HW5: Problem\ Sets/Problem\ Set\ 5/CMakeFiles/HW5.dir/rule + +.PHONY : HW5 + +# fast build rule for target. +HW5/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/build.make" "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/build" +.PHONY : HW5/fast + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/build.make" "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/build.make" "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/build.make" "Problem Sets/Problem Set 5/CMakeFiles/HW5.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets 
for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW5" + @echo "... rebuild_cache" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff --git a/Problem Sets/Problem Set 5/main.cu b/Problem Sets/Problem Set 5/main.cu index 15071cd4..1a2f9022 100755 --- a/Problem Sets/Problem Set 5/main.cu +++ b/Problem Sets/Problem Set 5/main.cu @@ -51,7 +51,7 @@ int main(void) thrust::minstd_rand rng; - thrust::random::experimental::normal_distribution normalDist((float)mean, stddev); + thrust::random::normal_distribution normalDist((float)mean, stddev); // Generate the random values for (size_t i = 0; i < numElems; ++i) { diff --git a/Problem Sets/Problem Set 6/Makefile b/Problem Sets/Problem Set 6/Makefile old mode 100755 new mode 100644 index c313cb29..de2a7e51 --- a/Problem Sets/Problem Set 6/Makefile +++ b/Problem Sets/Problem Set 6/Makefile @@ -1,63 +1,270 @@ -NVCC=/usr/local/cuda-5.0/bin/nvcc -#NVCC=nvcc +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.5 -################################### -# These are the default install # -# locations on most linux distros # -################################### +# Default target executed when no arguments are given to make. 
+default_target: all -OPENCV_LIBPATH=/usr/lib -OPENCV_INCLUDEPATH=/usr/include +.PHONY : default_target -################################################### -# On Macs the default install locations are below # -################################################### +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: -#OPENCV_LIBPATH=/usr/local/lib -#OPENCV_INCLUDEPATH=/usr/local/include -OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui +#============================================================================= +# Special targets provided by cmake. -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include -# CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include -# CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include -# CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include +# Disable implicit rules so canonical targets will work. +.SUFFIXES: -###################################################### -# On Macs the default install locations are below # -# #################################################### -#CUDA_INCLUDEPATH=/usr/local/cuda/include -#CUDA_LIBPATH=/usr/local/cuda/lib -CUDA_LIBPATH=/usr/local/cuda-5.0/lib64 +# Remove some rules from gmake that .SUFFIXES does not remove. +SUFFIXES = -#no warnings otherwise thrust explodes output +.SUFFIXES: .hpux_make_needs_suffix_list -NVCC_OPTS=-O3 -arch=sm_20 -m64 -GCC_OPTS=-O3 -m64 +# Suppress display of executed commands. +$(VERBOSE).SILENT: -student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile - $(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) -main.o: main.cpp timer.h utils.h - g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# A target that is always out of date. 
+cmake_force: -HW6.o: HW6.cu loadSaveImage.h utils.h - $(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) +.PHONY : cmake_force -loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h - g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +#============================================================================= +# Set environment variables for the build. -student_func.o: student_func.cu reference_calc.cpp utils.h - $(NVCC) -c student_func.cu $(NVCC_OPTS) +# The shell in which to execute make rules. +SHELL = /bin/sh -compare.o: compare.cpp compare.h - g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake -reference_calc.o: reference_calc.cpp reference_calc.h - g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) +# The command to remove a file. +RM = /usr/bin/cmake -E remove -f +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/wuqiongs/cs344 + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/wuqiongs/cs344 + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache + +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache + +.PHONY : rebuild_cache/fast + +# The main all target +all: cmake_check_build_system + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles "/home/wuqiongs/cs344/Problem Sets/Problem Set 6/CMakeFiles/progress.marks" + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 6/all" + $(CMAKE_COMMAND) -E cmake_progress_start /home/wuqiongs/cs344/CMakeFiles 0 +.PHONY : all + +# The main clean target clean: - rm -f *.o hw - find . -type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 6/clean" +.PHONY : clean + +# The main clean target +clean/fast: clean + +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 6/preinstall" +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 6/preinstall" +.PHONY : preinstall/fast + +# clear depends +depend: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +# Convenience name for target. +Problem\ Sets/Problem\ Set\ 6/CMakeFiles/HW6.dir/rule: + cd /home/wuqiongs/cs344 && $(MAKE) -f CMakeFiles/Makefile2 "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/rule" +.PHONY : Problem\ Sets/Problem\ Set\ 6/CMakeFiles/HW6.dir/rule + +# Convenience name for target. +HW6: Problem\ Sets/Problem\ Set\ 6/CMakeFiles/HW6.dir/rule + +.PHONY : HW6 + +# fast build rule for target. 
+HW6/fast: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build" +.PHONY : HW6/fast + +compare.o: compare.cpp.o + +.PHONY : compare.o + +# target to build an object file +compare.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/compare.cpp.o" +.PHONY : compare.cpp.o + +compare.i: compare.cpp.i + +.PHONY : compare.i + +# target to preprocess a source file +compare.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/compare.cpp.i" +.PHONY : compare.cpp.i + +compare.s: compare.cpp.s + +.PHONY : compare.s + +# target to generate assembly for a file +compare.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/compare.cpp.s" +.PHONY : compare.cpp.s + +loadSaveImage.o: loadSaveImage.cpp.o + +.PHONY : loadSaveImage.o + +# target to build an object file +loadSaveImage.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/loadSaveImage.cpp.o" +.PHONY : loadSaveImage.cpp.o + +loadSaveImage.i: loadSaveImage.cpp.i + +.PHONY : loadSaveImage.i + +# target to preprocess a source file +loadSaveImage.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/loadSaveImage.cpp.i" +.PHONY : loadSaveImage.cpp.i + +loadSaveImage.s: loadSaveImage.cpp.s + +.PHONY : loadSaveImage.s + +# target to generate assembly for a file +loadSaveImage.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 
6/CMakeFiles/HW6.dir/loadSaveImage.cpp.s" +.PHONY : loadSaveImage.cpp.s + +main.o: main.cpp.o + +.PHONY : main.o + +# target to build an object file +main.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/main.cpp.o" +.PHONY : main.cpp.o + +main.i: main.cpp.i + +.PHONY : main.i + +# target to preprocess a source file +main.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/main.cpp.i" +.PHONY : main.cpp.i + +main.s: main.cpp.s + +.PHONY : main.s + +# target to generate assembly for a file +main.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/main.cpp.s" +.PHONY : main.cpp.s + +reference_calc.o: reference_calc.cpp.o + +.PHONY : reference_calc.o + +# target to build an object file +reference_calc.cpp.o: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/reference_calc.cpp.o" +.PHONY : reference_calc.cpp.o + +reference_calc.i: reference_calc.cpp.i + +.PHONY : reference_calc.i + +# target to preprocess a source file +reference_calc.cpp.i: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/reference_calc.cpp.i" +.PHONY : reference_calc.cpp.i + +reference_calc.s: reference_calc.cpp.s + +.PHONY : reference_calc.s + +# target to generate assembly for a file +reference_calc.cpp.s: + cd /home/wuqiongs/cs344 && $(MAKE) -f "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/build.make" "Problem Sets/Problem Set 6/CMakeFiles/HW6.dir/reference_calc.cpp.s" +.PHONY : reference_calc.cpp.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... 
all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... HW6" + @echo "... rebuild_cache" + @echo "... compare.o" + @echo "... compare.i" + @echo "... compare.s" + @echo "... loadSaveImage.o" + @echo "... loadSaveImage.i" + @echo "... loadSaveImage.s" + @echo "... main.o" + @echo "... main.i" + @echo "... main.s" + @echo "... reference_calc.o" + @echo "... reference_calc.i" + @echo "... reference_calc.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + cd /home/wuqiongs/cs344 && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + From c3c06c78ece8bf741fb0fab0a6407dd04a1a8142 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Wed, 1 Aug 2018 18:20:46 -0400 Subject: [PATCH 02/15] PS1 --- Problem Sets/Problem Set 1/student_func.cu | 26 +++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/Problem Sets/Problem Set 1/student_func.cu b/Problem Sets/Problem Set 1/student_func.cu index 452b379f..de5af4ae 100755 --- a/Problem Sets/Problem Set 1/student_func.cu +++ b/Problem Sets/Problem Set 1/student_func.cu @@ -33,12 +33,16 @@ #include "utils.h" +// This is used for Quadro P4000. +// In total, a 2D block has 32 * 32 = 1024 threads, +// the maximum on a P4000. 
+int BLOCK_DIM = 32; + __global__ void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) { - //TODO //Fill in the kernel to convert from color to greyscale //the mapping from components of a uchar4 to RGBA is: // .x -> R ; .y -> G ; .z -> B ; .w -> A @@ -50,6 +54,16 @@ void rgba_to_greyscale(const uchar4* const rgbaImage, //First create a mapping from the 2D block and grid locations //to an absolute 2D location in the image, then use that to //calculate a 1D offset + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < numCols && y < numRows) { + int offset = y * numCols + x; + uchar4 rgba = rgbaImage[offset]; + + float channelSum = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z; + greyImage[offset] = channelSum; + } } void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage, @@ -57,8 +71,14 @@ void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_r { //You must fill in the correct sizes for the blockSize and gridSize //currently only one block with one thread is being launched - const dim3 blockSize(1, 1, 1); //TODO - const dim3 gridSize( 1, 1, 1); //TODO + const dim3 blockSize(BLOCK_DIM, BLOCK_DIM, 1); + + //Compute the sizes of the grid + int gridRow = (numRows + BLOCK_DIM - 1) / BLOCK_DIM; + int gridCol = (numCols + BLOCK_DIM - 1) / BLOCK_DIM; + + // Crucial! Col is for x index, Row is for y index! 
+ const dim3 gridSize(gridCol, gridRow, 1); rgba_to_greyscale<<>>(d_rgbaImage, d_greyImage, numRows, numCols); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); From c9e3a9c74517615f4aaa5565f8aea6b91d035c84 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Wed, 1 Aug 2018 22:08:24 -0400 Subject: [PATCH 03/15] PS1 change block size to 16 x 16 --- Problem Sets/Problem Set 1/student_func.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Problem Sets/Problem Set 1/student_func.cu b/Problem Sets/Problem Set 1/student_func.cu index de5af4ae..dee10518 100755 --- a/Problem Sets/Problem Set 1/student_func.cu +++ b/Problem Sets/Problem Set 1/student_func.cu @@ -35,8 +35,10 @@ // This is used for Quadro P4000. // In total, a 2D block has 32 * 32 = 1024 threads, -// the maximum on a P4000. -int BLOCK_DIM = 32; +// the maximum on a P4000. +// Somehow 16 x 16 block size performs better than +// 32 x 32 block size. +int BLOCK_DIM = 16; __global__ void rgba_to_greyscale(const uchar4* const rgbaImage, From 55074e4c1f0bc2e7f8186b4f65c9bce3df3cb2dc Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Thu, 2 Aug 2018 15:28:33 -0400 Subject: [PATCH 04/15] PS2 passing test --- Problem Sets/Problem Set 2/student_func.cu | 92 ++++++++++++++++------ 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu index 825e412b..870fb592 100755 --- a/Problem Sets/Problem Set 2/student_func.cu +++ b/Problem Sets/Problem Set 2/student_func.cu @@ -108,27 +108,45 @@ void gaussian_blur(const unsigned char* const inputChannel, int numRows, int numCols, const float* const filter, const int filterWidth) { - // TODO - // NOTE: Be sure to compute any intermediate results in floating point // before storing the final result as unsigned char. - // NOTE: Be careful not to try to access memory that is outside the bounds of - // the image. 
You'll want code that performs the following check before accessing - // GPU memory: - // - // if ( absolute_image_position_x >= numCols || - // absolute_image_position_y >= numRows ) - // { - // return; - // } - // NOTE: If a thread's absolute position 2D position is within the image, but some of // its neighbors are outside the image, then you will need to be extra careful. Instead // of trying to read such a neighbor value from GPU memory (which won't work because // the value is out of bounds), you should explicitly clamp the neighbor values you read // to be within the bounds of the image. If this is not clear to you, then please refer // to sequential reference solution for the exact clamping semantics you should follow. + + const int2 thread_2D_pos = make_int2( blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y); + + //make sure we don't try and access memory outside the image + //by having any threads mapped there return early + if (thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows) + return; + + // Gaussian blur for point at thread_2d_pos + int r = thread_2D_pos.y; + int c = thread_2D_pos.x; + float result = 0.f; + for (int filter_r = - filterWidth/2; filter_r <= filterWidth/2; filter_r++) { + for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; filter_c++) { + int image_r = min(max(r + filter_r, 0), numRows - 1); + int image_c = min(max(c + filter_c, 0), numCols - 1); + + int offset = image_r * numCols + image_c; + float imageValue = (float)inputChannel[offset]; + + int filterOffset = (filter_r + filterWidth/2) * filterWidth + + filter_c + filterWidth/2; + float filterValue = filter[filterOffset]; + result += imageValue * filterValue; + } + } + + int outputOffset = r * numCols + c; + outputChannel[outputOffset] = result; } //This kernel takes in an image represented as a uchar4 and splits @@ -141,8 +159,6 @@ void separateChannels(const uchar4* const inputImageRGBA, unsigned char* const greenChannel, unsigned 
char* const blueChannel) { - // TODO - // // NOTE: Be careful not to try to access memory that is outside the bounds of // the image. You'll want code that performs the following check before accessing // GPU memory: @@ -152,6 +168,23 @@ void separateChannels(const uchar4* const inputImageRGBA, // { // return; // } + + int absolute_image_position_x = blockIdx.x * blockDim.x + threadIdx.x; + int absolute_image_position_y = blockIdx.y * blockDim.y + threadIdx.y; + + if ( absolute_image_position_x >= numCols || + absolute_image_position_y >= numRows ) + { + return; + } + + int offset = absolute_image_position_y * numCols + + absolute_image_position_x; + + uchar4 rgba = inputImageRGBA[offset]; + redChannel[offset] = rgba.x; + greenChannel[offset] = rgba.y; + blueChannel[offset] = rgba.z; } //This kernel takes in three color channels and recombines them @@ -198,19 +231,20 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage)); checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage)); - //TODO: + size_t filterSize = sizeof(float) * filterWidth * filterWidth; //Allocate memory for the filter on the GPU //Use the pointer d_filter that we have already declared for you //You need to allocate memory for the filter with cudaMalloc //be sure to use checkCudaErrors like the above examples to //be able to tell if anything goes wrong //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc + checkCudaErrors(cudaMalloc(&d_filter, filterSize)); - //TODO: //Copy the filter on the host (h_filter) to the memory you just allocated //on the GPU. cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice); //Remember to use checkCudaErrors! 
- + checkCudaErrors(cudaMemcpy((void *)d_filter, (void *)h_filter, filterSize, + cudaMemcpyHostToDevice)); } void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA, @@ -220,21 +254,31 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_ unsigned char *d_blueBlurred, const int filterWidth) { - //TODO: Set reasonable block size (i.e., number of threads per block) - const dim3 blockSize; + // Set reasonable block size (i.e., number of threads per block) + const int blk_dim = 16; + const dim3 blockSize(blk_dim, blk_dim, 1); - //TODO: //Compute correct grid size (i.e., number of blocks per kernel launch) //from the image size and and block size. - const dim3 gridSize; + int gridCols = (numCols + blk_dim - 1) / blk_dim; + int gridRows = (numRows + blk_dim - 1) / blk_dim; + const dim3 gridSize(gridCols, gridRows, 1); - //TODO: Launch a kernel for separating the RGBA image into different color channels + //Launch a kernel for separating the RGBA image into different color channels + separateChannels<<>>(d_inputImageRGBA, numRows, numCols, + d_red, d_green, d_blue); // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after // launching your kernel to make sure that you didn't make any mistakes. cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); - //TODO: Call your convolution kernel here 3 times, once for each color channel. + //Call your convolution kernel here 3 times, once for each color channel. + gaussian_blur<<>>(d_red, d_redBlurred, numRows, numCols, + d_filter, filterWidth); + gaussian_blur<<>>(d_green, d_greenBlurred, numRows, numCols, + d_filter, filterWidth); + gaussian_blur<<>>(d_blue, d_blueBlurred, numRows, numCols, + d_filter, filterWidth); // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after // launching your kernel to make sure that you didn't make any mistakes. 
@@ -257,8 +301,10 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_ //Free all the memory that we allocated //TODO: make sure you free any arrays that you allocated +//Done - added code to free d_filter. void cleanup() { checkCudaErrors(cudaFree(d_red)); checkCudaErrors(cudaFree(d_green)); checkCudaErrors(cudaFree(d_blue)); + checkCudaErrors(cudaFree(d_filter)); } From a91894b2f1118a16ac020d359510e6c9d8438981 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Thu, 2 Aug 2018 15:51:31 -0400 Subject: [PATCH 05/15] PS2 shared memory - not much speedup --- Problem Sets/Problem Set 2/student_func.cu | 23 ++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu index 870fb592..06583f3c 100755 --- a/Problem Sets/Problem Set 2/student_func.cu +++ b/Problem Sets/Problem Set 2/student_func.cu @@ -101,6 +101,7 @@ //**************************************************************************** #include "utils.h" +#include "stdio.h" __global__ void gaussian_blur(const unsigned char* const inputChannel, @@ -121,6 +122,16 @@ void gaussian_blur(const unsigned char* const inputChannel, const int2 thread_2D_pos = make_int2( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y); + // optimization with shared memory + // filterWidth is 9 in this example, so it fits in the shared + // memory (size of 49152 bytes for P4000). 
+ extern __shared__ float s_filter[]; + if (threadIdx.x < filterWidth && threadIdx.y < filterWidth) { + int idx = threadIdx.y * filterWidth + threadIdx.x; + s_filter[idx] = filter[idx]; + } + __syncthreads(); + //make sure we don't try and access memory outside the image //by having any threads mapped there return early if (thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows) @@ -140,7 +151,7 @@ void gaussian_blur(const unsigned char* const inputChannel, int filterOffset = (filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2; - float filterValue = filter[filterOffset]; + float filterValue = s_filter[filterOffset]; result += imageValue * filterValue; } } @@ -273,11 +284,15 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_ cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); //Call your convolution kernel here 3 times, once for each color channel. - gaussian_blur<<>>(d_red, d_redBlurred, numRows, numCols, + int sharedMemSize = sizeof(float) * filterWidth * filterWidth; + gaussian_blur<<>>(d_red, d_redBlurred, + numRows, numCols, d_filter, filterWidth); - gaussian_blur<<>>(d_green, d_greenBlurred, numRows, numCols, + gaussian_blur<<>>(d_green, d_greenBlurred, + numRows, numCols, d_filter, filterWidth); - gaussian_blur<<>>(d_blue, d_blueBlurred, numRows, numCols, + gaussian_blur<<>>(d_blue, d_blueBlurred, + numRows, numCols, d_filter, filterWidth); // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after From 2ac8190b18b7ffc6b5fdb3663b56fbd7e0a9f91a Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Fri, 3 Aug 2018 13:39:04 -0400 Subject: [PATCH 06/15] PS3 steps 1 and 2 --- Problem Sets/Problem Set 3/student_func.cu | 170 +++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu index 26f00a74..64a2a0c1 100755 --- a/Problem Sets/Problem Set 3/student_func.cu +++ b/Problem Sets/Problem 
Set 3/student_func.cu @@ -80,6 +80,141 @@ */ #include "utils.h" +#include "float.h" + +#include "stdio.h" + +#define THREADS_PER_BLOCK 1024 + +// This function assumes that the blocks +// and the grids are 1-D and +// blockDim.x is a power of 2. +__global__ void g_reduce_max(float* d_out, + const float* const d_in, + const size_t size) +{ + extern __shared__ float sdata[]; + + int myId = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.x; + + float value = (myId < size)? d_in[myId] : FLT_MIN; + sdata[tid] = value; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) { + sdata[tid] = max(sdata[tid], sdata[tid + s]); + } + + __syncthreads(); + } + + if (tid == 0) + { + d_out[blockIdx.x] = sdata[0]; + } +} + +// reduce min. This is essentially a duplicate of g_reduce_max. +// Is there a way to pass in a function pointer? +__global__ void g_reduce_min(float* d_out, + const float* const d_in, + const size_t size) +{ + extern __shared__ float sdata[]; + + int myId = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.x; + + float value = (myId < size)? d_in[myId] : FLT_MAX; + sdata[tid] = value; + __syncthreads(); + + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s) { + sdata[tid] = min(sdata[tid], sdata[tid + s]); + } + __syncthreads(); + } + + if (tid == 0) + { + d_out[blockIdx.x] = sdata[0]; + } +} + +// Helper function to find the smallest power of 2 bigger than an +// unsigned int input. 
+unsigned nextPow2(unsigned n) { + if (!(n & (n - 1))) return n; + unsigned count = 0; + while (n != 0) + { + n >>= 1; + count++; + } + + return 1 << count; +} + +// optn == 0 min +// optn != 0 max +// function assumes size <= 2^20 +float reduce_extrema(const float* const d_in, const size_t size, int optn) { + unsigned threadsPerBlock = THREADS_PER_BLOCK; + unsigned numGrids = (size + threadsPerBlock - 1) / threadsPerBlock; + const dim3 blockSize(threadsPerBlock, 1, 1); + const dim3 gridSize(numGrids, 1, 1); + + float* d_intermediate; + float* d_result; + checkCudaErrors(cudaMalloc((void **) &d_intermediate, + numGrids * sizeof(float))); + checkCudaErrors(cudaMalloc((void **) &d_result, + sizeof(float))); + + size_t sharedMemSize = threadsPerBlock * sizeof(float); + + if (optn == 0) + { + g_reduce_min<<>>(d_intermediate, + d_in, size); + } else + { + g_reduce_max<<>>(d_intermediate, + d_in, size); + } + + + // call g_reduce a second time to process the results from + // each block of the previous call. + unsigned paddedNumThreads = nextPow2(numGrids); + sharedMemSize = paddedNumThreads * sizeof(float); + + if (optn == 0) + { + g_reduce_min<<<1, paddedNumThreads, sharedMemSize>>>(d_result, + d_intermediate, + numGrids); + } else + { + g_reduce_max<<<1, paddedNumThreads, sharedMemSize>>>(d_result, + d_intermediate, + numGrids); + } + + float h_result; + checkCudaErrors(cudaMemcpy(&h_result, d_result, sizeof(float), + cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaFree(d_intermediate)); + checkCudaErrors(cudaFree(d_result)); + + return h_result; +} void your_histogram_and_prefixsum(const float* const d_logLuminance, unsigned int* const d_cdf, @@ -100,5 +235,40 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, the cumulative distribution of luminance values (this should go in the incoming d_cdf pointer which already has been allocated for you) */ + // d_logLuninance is more like a 1-d structure. So we flatten everything. 
+ // int threadsPerBlock = 1024; + size_t size = numRows * numCols; + + printf("total size: %lu\n", size); + + // Step 1 compute the minimum and maximum. + float *h_logLuminance = (float *)malloc(sizeof(float) * size); + cudaMemcpy(h_logLuminance, d_logLuminance, sizeof(float) * size, + cudaMemcpyDeviceToHost); + + float real_max, real_min; + real_max = FLT_MIN; + real_min = FLT_MAX; + size_t max_idx, min_idx; + max_idx = 0; + min_idx = 0; + for (size_t i = 0; i < size; i++) { + if (h_logLuminance[i] >= real_max) + max_idx = i; + real_max = max(real_max, h_logLuminance[i]); + if (h_logLuminance[i] <= real_max) + min_idx = i; + real_min = min(real_min, h_logLuminance[i]); + } + + printf("ref max: %f at %lu\n", real_max, max_idx); + printf("ref min: %f at %lu\n", real_min, min_idx); + + float max = reduce_extrema(d_logLuminance, size, 1); + float min = reduce_extrema(d_logLuminance, size, 0); + printf("testing max: %f\n", max); + printf("testing min: %f\n", min); + // Step 2 compute the difference to find the range + float range = max - min; } From 3eaf8ab6f26e7b8217c43793b3b985b3b8992865 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Fri, 3 Aug 2018 15:11:43 -0400 Subject: [PATCH 07/15] PS3 step 3 --- Problem Sets/Problem Set 3/student_func.cu | 63 +++++++++++++--------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu index 64a2a0c1..a61cecc0 100755 --- a/Problem Sets/Problem Set 3/student_func.cu +++ b/Problem Sets/Problem Set 3/student_func.cu @@ -86,6 +86,17 @@ #define THREADS_PER_BLOCK 1024 +// utility for debugging +__global__ void printIntArray(int *d_array, int size) +{ + if (threadIdx.x != 0) return; + + for (int i = 0; i < size; i++) + { + printf("%d\t:\t%d\n", i, d_array[i]); + } +} + // This function assumes that the blocks // and the grids are 1-D and // blockDim.x is a power of 2. 
@@ -216,6 +227,17 @@ float reduce_extrema(const float* const d_in, const size_t size, int optn) { return h_result; } +__global__ void simple_hdr_histo(int *d_bins, const float *d_in, + const int numBins, + float min, float range) +{ + int myId = threadIdx.x + blockDim.x * blockIdx.x; + int myItem = d_in[myId]; + int myBin = (myItem - min) / range * numBins; + atomicAdd(&(d_bins[myBin]), 1); +} + + void your_histogram_and_prefixsum(const float* const d_logLuminance, unsigned int* const d_cdf, float &min_logLum, @@ -238,37 +260,26 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, // d_logLuninance is more like a 1-d structure. So we flatten everything. // int threadsPerBlock = 1024; size_t size = numRows * numCols; - - printf("total size: %lu\n", size); // Step 1 compute the minimum and maximum. - float *h_logLuminance = (float *)malloc(sizeof(float) * size); - cudaMemcpy(h_logLuminance, d_logLuminance, sizeof(float) * size, - cudaMemcpyDeviceToHost); - - float real_max, real_min; - real_max = FLT_MIN; - real_min = FLT_MAX; - size_t max_idx, min_idx; - max_idx = 0; - min_idx = 0; - for (size_t i = 0; i < size; i++) { - if (h_logLuminance[i] >= real_max) - max_idx = i; - real_max = max(real_max, h_logLuminance[i]); - if (h_logLuminance[i] <= real_max) - min_idx = i; - real_min = min(real_min, h_logLuminance[i]); - } - - printf("ref max: %f at %lu\n", real_max, max_idx); - printf("ref min: %f at %lu\n", real_min, min_idx); float max = reduce_extrema(d_logLuminance, size, 1); float min = reduce_extrema(d_logLuminance, size, 0); - printf("testing max: %f\n", max); - printf("testing min: %f\n", min); // Step 2 compute the difference to find the range - float range = max - min; + float range = max - min; + + // Step 3 generate histogram. 
+ int numThreads = THREADS_PER_BLOCK; + int numBlocks = (size + numThreads - 1) / numThreads; + int* d_bins; + checkCudaErrors(cudaMalloc((void **)&d_bins, sizeof(int) * numBins)); + cudaMemset(d_bins, 0, sizeof(int) * numBins); + simple_hdr_histo<<>>(d_bins, d_logLuminance, numBins, + min, range); + + // Step 4 the exclusive scan - assume numBins is a power of 2. + + // Cleaning up + checkCudaErrors(cudaFree(d_bins)); } From ed5438a209dccdba3d2eb8f08088961e1cdbca9f Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 6 Aug 2018 14:15:30 -0400 Subject: [PATCH 08/15] PS3 step 4 something is still not correct --- Problem Sets/Problem Set 3/student_func.cu | 112 ++++++++++++++++++--- 1 file changed, 97 insertions(+), 15 deletions(-) diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu index a61cecc0..4a389691 100755 --- a/Problem Sets/Problem Set 3/student_func.cu +++ b/Problem Sets/Problem Set 3/student_func.cu @@ -92,11 +92,22 @@ __global__ void printIntArray(int *d_array, int size) if (threadIdx.x != 0) return; for (int i = 0; i < size; i++) - { + { printf("%d\t:\t%d\n", i, d_array[i]); } } +__global__ void printUnsignedIntArray(unsigned int *d_array, int size) +{ + if (threadIdx.x != 0) return; + + for (int i = 0; i < size; i++) + { + if (d_array[i] != 0) + printf("%d\t:\t%u\n", i, d_array[i]); + } +} + // This function assumes that the blocks // and the grids are 1-D and // blockDim.x is a power of 2. 
@@ -227,16 +238,83 @@ float reduce_extrema(const float* const d_in, const size_t size, int optn) { return h_result; } -__global__ void simple_hdr_histo(int *d_bins, const float *d_in, +__global__ void simple_hdr_histo(unsigned int *d_bins, const float *d_in, const int numBins, - float min, float range) + float min_val, float range) { int myId = threadIdx.x + blockDim.x * blockIdx.x; - int myItem = d_in[myId]; - int myBin = (myItem - min) / range * numBins; + float myItem = d_in[myId]; + unsigned int myBin = min((unsigned int)(numBins - 1), + (unsigned int)((myItem - min_val) / range * numBins)); atomicAdd(&(d_bins[myBin]), 1); } +// Simple implementation of Blelloch Scan. +// This function assumes the number of blocks is 1. +// In another word, gridDim.x == 1 +// Addtionally, it assumes the number of threads per block +// is a power of 2. +// The shared data required is of size sizeof(int) * blockDim.x . +__global__ void excl_prefix_sum(unsigned int* const d_cdf, + const unsigned int* const d_bins, + const size_t size) +{ + extern __shared__ unsigned int idata[]; + + // because blockIdx.x == 0, we do not need blockDim offset. + int tid = threadIdx.x; + + idata[tid] = (tid < size)? 
d_bins[tid] : 0; + __syncthreads(); + + // summing up + for (unsigned int s = 1; s < blockDim.x; s <<= 1) + { + unsigned int temp = 0; + if ((tid + 1) % (2 * s) == 0) + temp = idata[tid - s]; + __syncthreads(); + + if ((tid + 1) % (2 * s) == 0) + idata[tid] += temp; + __syncthreads(); + } + + __syncthreads(); + // set max idx to identity + if (tid == blockDim.x - 1) + { + idata[tid] = 0; + } + + __syncthreads(); + + // downward sweep + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) + { + int temp1 = 0; + int temp2 = 0; + + if ((tid + 1) % (2 * s) == 0) + { + temp1 = idata[tid]; + temp2 = idata[tid - s]; + } + __syncthreads(); + + if ((tid + 1) % (2 * s) == 0) + { + idata[tid] += temp2; + idata[tid - s] = temp1; + } + __syncthreads(); + } + + if (tid < size) + { + d_cdf[tid] = idata[tid]; + } +} void your_histogram_and_prefixsum(const float* const d_logLuminance, unsigned int* const d_cdf, @@ -246,7 +324,6 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, const size_t numCols, const size_t numBins) { - //TODO /*Here are the steps you need to implement 1) find the minimum and maximum value in the input logLuminance channel store in min_logLum and max_logLum @@ -263,23 +340,28 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, // Step 1 compute the minimum and maximum. - float max = reduce_extrema(d_logLuminance, size, 1); - float min = reduce_extrema(d_logLuminance, size, 0); + float maxLum = reduce_extrema(d_logLuminance, size, 1); + float minLum = reduce_extrema(d_logLuminance, size, 0); + + printf("GPU min: %f\n", minLum); + printf("GPU max: %f\n", maxLum); // Step 2 compute the difference to find the range - float range = max - min; + float range = maxLum - minLum; // Step 3 generate histogram. 
int numThreads = THREADS_PER_BLOCK; int numBlocks = (size + numThreads - 1) / numThreads; - int* d_bins; - checkCudaErrors(cudaMalloc((void **)&d_bins, sizeof(int) * numBins)); - cudaMemset(d_bins, 0, sizeof(int) * numBins); + unsigned int* d_bins; + checkCudaErrors(cudaMalloc((void **)&d_bins, sizeof(unsigned int) * numBins)); + cudaMemset(d_bins, 0, sizeof(unsigned int) * numBins); simple_hdr_histo<<>>(d_bins, d_logLuminance, numBins, - min, range); + minLum, range); - // Step 4 the exclusive scan - assume numBins is a power of 2. - + // Step 4 the exclusive scan - assume numBins is a power of 2. + excl_prefix_sum<<<1, numBins, sizeof(unsigned int) * numBins>>>(d_cdf, + d_bins, + numBins); // Cleaning up checkCudaErrors(cudaFree(d_bins)); } From de5c287653d14a80d98a469656f15e4fd82714a6 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 6 Aug 2018 14:42:56 -0400 Subject: [PATCH 09/15] PS3 fix a bug --- Problem Sets/Problem Set 3/student_func.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu index 4a389691..41dc34fb 100755 --- a/Problem Sets/Problem Set 3/student_func.cu +++ b/Problem Sets/Problem Set 3/student_func.cu @@ -340,14 +340,11 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, // Step 1 compute the minimum and maximum. - float maxLum = reduce_extrema(d_logLuminance, size, 1); - float minLum = reduce_extrema(d_logLuminance, size, 0); - - printf("GPU min: %f\n", minLum); - printf("GPU max: %f\n", maxLum); + max_logLum = reduce_extrema(d_logLuminance, size, 1); + min_logLum = reduce_extrema(d_logLuminance, size, 0); // Step 2 compute the difference to find the range - float range = maxLum - minLum; + float range = max_logLum - min_logLum; // Step 3 generate histogram. 
int numThreads = THREADS_PER_BLOCK; @@ -356,7 +353,7 @@ void your_histogram_and_prefixsum(const float* const d_logLuminance, checkCudaErrors(cudaMalloc((void **)&d_bins, sizeof(unsigned int) * numBins)); cudaMemset(d_bins, 0, sizeof(unsigned int) * numBins); simple_hdr_histo<<>>(d_bins, d_logLuminance, numBins, - minLum, range); + min_logLum, range); // Step 4 the exclusive scan - assume numBins is a power of 2. excl_prefix_sum<<<1, numBins, sizeof(unsigned int) * numBins>>>(d_cdf, From 6ba11af29501763d202e4ff8b1c30f1abcff96cc Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Thu, 9 Aug 2018 11:16:34 -0400 Subject: [PATCH 10/15] PS4 histogram, scan and some debug changes --- Problem Sets/Problem Set 4/main.cpp | 2 +- Problem Sets/Problem Set 4/student_func.cu | 216 +++++++++++++++++++-- 2 files changed, 203 insertions(+), 15 deletions(-) diff --git a/Problem Sets/Problem Set 4/main.cpp b/Problem Sets/Problem Set 4/main.cpp index 146c8673..f66bc5c6 100755 --- a/Problem Sets/Problem Set 4/main.cpp +++ b/Problem Sets/Problem Set 4/main.cpp @@ -115,7 +115,7 @@ int main(int argc, char **argv) { &h_outputVals[0], &h_outputPos[0], numElems); - //postProcess(valsPtr, posPtr, numElems, reference_file); + //postProcess(&h_outputVals[0], &h_outputPos[0], numElems, reference_file); //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 347d7b6e..7827dfad 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -1,12 +1,13 @@ -//Udacity HW 4 -//Radix Sorting +// Udacity HW 4 +// Radix Sorting -#include "utils.h" #include +#include "string.h" +#include "utils.h" /* Red Eye Removal =============== - + For this assignment we are implementing red eye removal. This is accomplished by first creating a score for every pixel that tells us how likely it is to be a red eye pixel. 
We have already done this for you - you @@ -30,8 +31,8 @@ 1) Histogram of the number of occurrences of each digit 2) Exclusive Prefix Sum of Histogram 3) Determine relative offset of each digit - For example [0 0 1 1 0 0 1] - -> [0 1 0 1 2 3 2] + For example [0 0 1 1 0 0 1] + -> [0 1 0 1 2 3 2] 4) Combine the results of steps 2 & 3 to determine the final output location for each element and move it there @@ -42,13 +43,200 @@ */ +// This function assumes that there are two bins! +// The mask is used to pick the bit to compute the +// index of the bin. +__global__ void simple_hist(unsigned int *const d_bins, + const unsigned int *const d_in, const size_t size, + unsigned int mask, unsigned int pos) { + size_t i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= size) return; + unsigned int binIdx = ((d_in[i] & mask) >> pos); + atomicAdd(&(d_bins[binIdx]), 1); +} + +// Exclusive scan - Naive Hillis and Steele. +// Call with gridSize 1 +// and shared memory size blockSize * sizeof(unsigned int) +__global__ void naive_scan(unsigned int *const d_out, + const unsigned int *const d_in, const size_t size) { + extern __shared__ unsigned int sdata[]; + int tid = threadIdx.x; + sdata[tid] = (tid >= 1 && tid < size) ? d_in[tid - 1] : 0; + + for (unsigned int s = 1; s < blockDim.x; s <<= 1) { + unsigned int a = 0; + unsigned int b = 0; + if (tid >= 2 * s) { + a = sdata[tid - s]; + b = sdata[tid]; + } + __syncthreads(); + + if (tid >= 2 * s) sdata[tid] = a + b; + __syncthreads(); + } + + if (tid >= size) return; + d_out[tid] = sdata[tid]; +} + +void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, + unsigned int *const d_outputVals, + unsigned int *const d_outputPos, const size_t numElems) { + // Temporary testing area... 
+ // Setting up data + size_t testSize = 8; + size_t testMemSize = testSize * sizeof(unsigned int); + unsigned int *h_test_in = (unsigned int *)malloc(testMemSize); + unsigned int *h_test_out = (unsigned int *)malloc(testMemSize); + for (unsigned int i = 0; i < testSize; i++) { + h_test_in[i] = i + 1; + } + // CPU reference result + h_test_out[0] = 0; + for (unsigned int i = 1; i < testSize; i++) { + h_test_out[i] = h_test_in[i- 1] + h_test_out[i - 1]; + } + + // GPU test + unsigned int *d_test_in; + unsigned int *d_test_out; + unsigned int *h_gpu_out; + checkCudaErrors(cudaMalloc((void **)&d_test_in, testMemSize)); + checkCudaErrors(cudaMalloc((void **)&d_test_out, testMemSize)); + h_gpu_out = (unsigned int *)malloc(testMemSize); + + checkCudaErrors(cudaMemcpy(d_test_in, h_test_in, testMemSize, cudaMemcpyHostToDevice)); + + naive_scan<<<1, 8, 8 * sizeof(unsigned int)>>>(d_test_out, d_test_in, testSize); + + checkCudaErrors(cudaMemcpy(h_gpu_out, d_test_out, testMemSize, cudaMemcpyDeviceToHost)); + + for (unsigned int i = 0; i < testSize; i++) { + unsigned int gpu = h_gpu_out[i]; + unsigned int cpu = h_test_out[i]; + if (gpu != cpu) { + printf("scan gpu[%u]:%u\t cpu[%u]%u\n", i, gpu, i, cpu); + } + } + + // cleaning up + free(h_test_in); + free(h_test_out); + free(h_gpu_out); + checkCudaErrors(cudaFree(d_test_in)); + checkCudaErrors(cudaFree(d_test_out)); + + // end of testing area... 
+ const int numBits = 1; + const int numBins = 1 << numBits; + + size_t memSize = sizeof(unsigned int) * numElems; + + // CPU code for testing + unsigned int *binHistogram = new unsigned int[numBins]; + unsigned int *binScan = new unsigned int[numBins]; + + unsigned int *vals_src = new unsigned int[numElems]; + unsigned int *pos_src = new unsigned int[numElems]; + + unsigned int *vals_dst = new unsigned int[numElems]; + unsigned int *pos_dst = new unsigned int[numElems]; + + checkCudaErrors( + cudaMemcpy(vals_src, d_inputVals, memSize, cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(pos_src, d_inputPos, memSize, cudaMemcpyDeviceToHost)); + + // Temporary CPU arrays + unsigned int *h_binHistogram = new unsigned int[numBins]; + unsigned int *h_binScan = new unsigned int[numBins]; + + // GPU arrays + unsigned int *d_binHistogram; + unsigned int *d_binScan; + size_t g_memSize = sizeof(unsigned int) * numBins; + checkCudaErrors(cudaMalloc((void **)&d_binHistogram, g_memSize)); + checkCudaErrors(cudaMalloc((void **)&d_binScan, g_memSize)); + + // GPU temporary pointers + unsigned int *d_vals_src = d_inputVals; + unsigned int *d_pos_src = d_inputPos; + unsigned int *d_vals_dst = d_outputVals; + unsigned int *d_pos_dst = d_outputPos; + + // GPU kernel dimensions + unsigned int numThreads = 1024; + unsigned int numBlocks = (numElems + numThreads - 1) / numThreads; + dim3 blockSize(numThreads, 1, 1); + dim3 gridSize(numBlocks, 1, 1); + + // a simple radix sort - only guaranteed to work for numBits that are + // multiples of 2 + // main loop + for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) { + unsigned int mask = (numBins - 1) << i; + + memset(binHistogram, 0, + sizeof(unsigned int) * numBins); // zero out the bins + memset(binScan, 0, sizeof(unsigned int) * numBins); // zero out the + // bins + + // Reset GPU arrays + checkCudaErrors(cudaMemset(d_binHistogram, 0, g_memSize)); + checkCudaErrors(cudaMemset(d_binScan, 0, g_memSize)); + + // CPU 
histogram + // perform histogram of data & mask into bins + for (unsigned int j = 0; j < numElems; ++j) { + unsigned int bin = (vals_src[j] & mask) >> i; + binHistogram[bin]++; + } + + // GPU histogram + simple_hist<<>>(d_binHistogram, d_vals_src, + numElems, mask, i); + checkCudaErrors(cudaMemcpy(h_binHistogram, d_binHistogram, g_memSize, + cudaMemcpyDeviceToHost)); + // check GPU results + for (unsigned int j = 0; j < numBins; j++) { + if (h_binHistogram[j] != binHistogram[j]) { + printf("GPU hist[%d]:%u\tCPU hist[%d]:%u\n", j, + h_binHistogram[j], j, binHistogram[j]); + } + } + + // CPU scan + // perform exclusive prefix sum (scan) on binHistogram to get starting + // location for each bin + for (unsigned int j = 1; j < numBins; ++j) { + binScan[j] = binScan[j - 1] + binHistogram[j - 1]; + } + + // GPU scan + /*__global__ void naive_scan(unsigned int* const d_out, + const unsigned int* const d_in, + const size_t size)*/ + + // Gather everything into the correct location + // need to move vals and positions + for (unsigned int j = 0; j < numElems; ++j) { + unsigned int bin = (vals_src[j] & mask) >> i; + vals_dst[binScan[bin]] = vals_src[j]; + pos_dst[binScan[bin]] = pos_src[j]; + binScan[bin]++; + } + + // swap the buffers (pointers only) + std::swap(vals_dst, vals_src); + std::swap(pos_dst, pos_src); + } + + // we did an even number of iterations, need to copy from input buffer into + // output std::copy(inputVals, inputVals + numElems, outputVals); + // std::copy(inputPos, inputPos + numElems, outputPos); -void your_sort(unsigned int* const d_inputVals, - unsigned int* const d_inputPos, - unsigned int* const d_outputVals, - unsigned int* const d_outputPos, - const size_t numElems) -{ - //TODO - //PUT YOUR SORT HERE + delete[] binHistogram; + delete[] binScan; } From 6ac936df19b7d48cc7bc48f4ddae349676226fb3 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Thu, 9 Aug 2018 11:45:12 -0400 Subject: [PATCH 11/15] PS4 scan works --- Problem Sets/Problem Set 
4/student_func.cu | 32 +++++++++++++++------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 7827dfad..02a78518 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -65,16 +65,16 @@ __global__ void naive_scan(unsigned int *const d_out, sdata[tid] = (tid >= 1 && tid < size) ? d_in[tid - 1] : 0; for (unsigned int s = 1; s < blockDim.x; s <<= 1) { - unsigned int a = 0; - unsigned int b = 0; - if (tid >= 2 * s) { - a = sdata[tid - s]; - b = sdata[tid]; - } - __syncthreads(); + unsigned int a = 0; + unsigned int b = 0; + if (tid >= s) { + a = sdata[tid - s]; + b = sdata[tid]; + } + __syncthreads(); - if (tid >= 2 * s) sdata[tid] = a + b; - __syncthreads(); + if (tid >= s) sdata[tid] = a + b; + __syncthreads(); } if (tid >= size) return; @@ -129,7 +129,8 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, checkCudaErrors(cudaFree(d_test_out)); // end of testing area... 
- const int numBits = 1; + + const int numBits = 4; const int numBins = 1 << numBits; size_t memSize = sizeof(unsigned int) * numElems; @@ -218,7 +219,18 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, /*__global__ void naive_scan(unsigned int* const d_out, const unsigned int* const d_in, const size_t size)*/ + naive_scan<<<1, numBins, numBins * sizeof(unsigned int)>>>(d_binScan, d_binHistogram, numBins); + checkCudaErrors(cudaMemcpy(h_binScan, d_binScan, numBins * sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + // check GPU results + for (unsigned int j = 0; j < numBins; j++) { + if (h_binScan[j] != binScan[j]) { + printf("GPU scan[%d]:%u\tCPU scan[%d]:%u\n", j, + h_binHistogram[j], j, binHistogram[j]); + } + } + // Gather everything into the correct location // need to move vals and positions for (unsigned int j = 0; j < numElems; ++j) { From 330a721c6446abd5ecdb2ed98039aec2e4a4f16b Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 13 Aug 2018 12:38:06 -0400 Subject: [PATCH 12/15] PS4 main loop seems to be correct --- Problem Sets/Problem Set 4/student_func.cu | 558 +++++++++++++++------ 1 file changed, 410 insertions(+), 148 deletions(-) diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 02a78518..2181c341 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -14,32 +14,32 @@ are receiving the scores and need to sort them in ascending order so that we know which pixels to alter to remove the red eye. - Note: ascending order == smallest to largest +Note: ascending order == smallest to largest - Each score is associated with a position, when you sort the scores, you must - also move the positions accordingly. +Each score is associated with a position, when you sort the scores, you must +also move the positions accordingly. 
- Implementing Parallel Radix Sort with CUDA - ========================================== +Implementing Parallel Radix Sort with CUDA +========================================== - The basic idea is to construct a histogram on each pass of how many of each - "digit" there are. Then we scan this histogram so that we know where to put - the output of each digit. For example, the first 1 must come after all the - 0s so we have to know how many 0s there are to be able to start moving 1s - into the correct position. +The basic idea is to construct a histogram on each pass of how many of each +"digit" there are. Then we scan this histogram so that we know where to put +the output of each digit. For example, the first 1 must come after all the +0s so we have to know how many 0s there are to be able to start moving 1s +into the correct position. - 1) Histogram of the number of occurrences of each digit - 2) Exclusive Prefix Sum of Histogram - 3) Determine relative offset of each digit - For example [0 0 1 1 0 0 1] - -> [0 1 0 1 2 3 2] - 4) Combine the results of steps 2 & 3 to determine the final - output location for each element and move it there +1) Histogram of the number of occurrences of each digit +2) Exclusive Prefix Sum of Histogram +3) Determine relative offset of each digit +For example [0 0 1 1 0 0 1] +-> [0 1 0 1 2 3 2] +4) Combine the results of steps 2 & 3 to determine the final +output location for each element and move it there - LSB Radix sort is an out-of-place sort and you will need to ping-pong values - between the input and output buffers we have provided. Make sure the final - sorted results end up in the output buffer! Hint: You may need to do a copy - at the end. +LSB Radix sort is an out-of-place sort and you will need to ping-pong values +between the input and output buffers we have provided. Make sure the final +sorted results end up in the output buffer! Hint: You may need to do a copy +at the end. 
*/ @@ -47,8 +47,8 @@ // The mask is used to pick the bit to compute the // index of the bin. __global__ void simple_hist(unsigned int *const d_bins, - const unsigned int *const d_in, const size_t size, - unsigned int mask, unsigned int pos) { + const unsigned int *const d_in, const size_t size, + unsigned int mask, unsigned int pos) { size_t i = blockDim.x * blockIdx.x + threadIdx.x; if (i >= size) return; unsigned int binIdx = ((d_in[i] & mask) >> pos); @@ -59,76 +59,240 @@ __global__ void simple_hist(unsigned int *const d_bins, // Call with gridSize 1 // and shared memory size blockSize * sizeof(unsigned int) __global__ void naive_scan(unsigned int *const d_out, - const unsigned int *const d_in, const size_t size) { + const unsigned int *const d_in, const size_t size) { extern __shared__ unsigned int sdata[]; int tid = threadIdx.x; sdata[tid] = (tid >= 1 && tid < size) ? d_in[tid - 1] : 0; for (unsigned int s = 1; s < blockDim.x; s <<= 1) { - unsigned int a = 0; - unsigned int b = 0; - if (tid >= s) { - a = sdata[tid - s]; - b = sdata[tid]; - } - __syncthreads(); - - if (tid >= s) sdata[tid] = a + b; - __syncthreads(); + unsigned int a = 0; + unsigned int b = 0; + if (tid >= s) { + a = sdata[tid - s]; + b = sdata[tid]; + } + __syncthreads(); + + if (tid >= s) sdata[tid] = a + b; + __syncthreads(); } if (tid >= size) return; d_out[tid] = sdata[tid]; } +size_t nextPow2(size_t x) { + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x |= x >> 32; + return ++x; +} + +// naive inclusive scan across blocks +__global__ void naive_inclusive_scan_per_block(unsigned int *const d_out, + unsigned int *const d_intermediate, + const unsigned int *const d_in, const size_t size) { + extern __shared__ unsigned int sdata[]; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int tid = threadIdx.x; + sdata[tid] = (idx < size) ? 
d_in[idx] : 0; + + __syncthreads(); + + for (unsigned int s = 1; s < blockDim.x; s <<= 1) { + unsigned int a = 0; + unsigned int b = 0; + if (tid >= s) { + a = sdata[tid - s]; + b = sdata[tid]; + } + __syncthreads(); + + if (tid >= s) sdata[tid] = a + b; + __syncthreads(); + } + + __syncthreads(); + + if (tid >= size) return; + + d_out[idx] = sdata[tid]; + + if (tid == blockDim.x - 1) { + d_intermediate[blockIdx.x] = sdata[tid]; + } +} + +__global__ void scatter_for_multiBlockScan(unsigned int *const d_out, + const unsigned int *const d_in, + const unsigned int *const d_interm_accum, + const size_t size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= size) return; + unsigned int self = d_in[idx]; + unsigned int inc = d_interm_accum[blockIdx.x]; + d_out[idx] += inc - self; +} + + +__global__ void printUnsignedIntArray(const unsigned int *const arr, + size_t size) { + if (threadIdx.x == 0) { + for (size_t i = 0; i < size; i++) { + printf("arr[%lu]:\t%u\n", i, arr[i]); + } + } +} + +// Exclusive scan for sizes <= 2^20 +void multiBlockScan(unsigned int *const d_out, const unsigned int *const d_in, + const size_t size) { + size_t numThreads = 1024; + size_t numBlocks = (nextPow2(size) + numThreads - 1) / numThreads; + + // allocate temporary GPU arrays + unsigned int *d_intermediate; + unsigned int *d_interm_accum; + checkCudaErrors(cudaMalloc((void **)&d_intermediate, sizeof(unsigned int) * numBlocks)); + checkCudaErrors(cudaMalloc((void **)&d_interm_accum, sizeof(unsigned int) * numBlocks)); + + // inclusive scan to produce prefix sum in each block + naive_inclusive_scan_per_block<<>> + (d_out, d_intermediate, d_in, size); + //printUnsignedIntArray<<<1,1>>>(d_out, size); + + // exclusive scan to produce prefix sum of d_intermediate + naive_scan<<<1, numBlocks, sizeof(unsigned int) * numBlocks>>> + (d_interm_accum, d_intermediate, numBlocks); + + + // add results of d_intermediate back into each block + // and subtract d_in[0] from all elements to 
get the final results. + scatter_for_multiBlockScan<<>> + (d_out, d_in, d_interm_accum, size); + + checkCudaErrors(cudaFree(d_intermediate)); + checkCudaErrors(cudaFree(d_interm_accum)); +} + +__global__ void map_to_binFlags(unsigned int *const d_binFlags, + const unsigned int *const d_in, unsigned int mask, + unsigned int pos, unsigned int binIdx, const size_t size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= size) return; + unsigned int bin = ((d_in[idx] & mask) >> pos); + d_binFlags[idx] = (bin == binIdx)? 1 : 0; +} + +__global__ void offset_by_base(unsigned int *const d_out, + const unsigned int *const d_binScan, + unsigned int binIdx, + const size_t size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= size) return; + unsigned amount = d_binScan[binIdx]; + d_out[idx] = d_out[idx] + amount; +} + +__global__ void reorder(unsigned int *d_out, const unsigned int *const d_in, + const unsigned int *const d_flags, const unsigned int *d_addr, + const size_t size) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= size) return; + if (d_flags[idx]) { + d_out[d_addr[idx]] = d_in[idx]; + } +} + +void swap_device_ptr(unsigned int **d1, unsigned int **d2) { + unsigned int *temp = *d1; + *d1 = *d2; + *d2 = temp; +} + void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, - unsigned int *const d_outputVals, - unsigned int *const d_outputPos, const size_t numElems) { + unsigned int *const d_outputVals, + unsigned int *const d_outputPos, const size_t numElems) { // Temporary testing area... 
- // Setting up data - size_t testSize = 8; - size_t testMemSize = testSize * sizeof(unsigned int); - unsigned int *h_test_in = (unsigned int *)malloc(testMemSize); - unsigned int *h_test_out = (unsigned int *)malloc(testMemSize); - for (unsigned int i = 0; i < testSize; i++) { - h_test_in[i] = i + 1; - } - // CPU reference result - h_test_out[0] = 0; - for (unsigned int i = 1; i < testSize; i++) { - h_test_out[i] = h_test_in[i- 1] + h_test_out[i - 1]; - } - - // GPU test - unsigned int *d_test_in; - unsigned int *d_test_out; - unsigned int *h_gpu_out; - checkCudaErrors(cudaMalloc((void **)&d_test_in, testMemSize)); - checkCudaErrors(cudaMalloc((void **)&d_test_out, testMemSize)); - h_gpu_out = (unsigned int *)malloc(testMemSize); - - checkCudaErrors(cudaMemcpy(d_test_in, h_test_in, testMemSize, cudaMemcpyHostToDevice)); - - naive_scan<<<1, 8, 8 * sizeof(unsigned int)>>>(d_test_out, d_test_in, testSize); - - checkCudaErrors(cudaMemcpy(h_gpu_out, d_test_out, testMemSize, cudaMemcpyDeviceToHost)); - - for (unsigned int i = 0; i < testSize; i++) { - unsigned int gpu = h_gpu_out[i]; - unsigned int cpu = h_test_out[i]; - if (gpu != cpu) { - printf("scan gpu[%u]:%u\t cpu[%u]%u\n", i, gpu, i, cpu); - } - } - - // cleaning up - free(h_test_in); - free(h_test_out); - free(h_gpu_out); - checkCudaErrors(cudaFree(d_test_in)); - checkCudaErrors(cudaFree(d_test_out)); - - // end of testing area... 
+ printf("numElems %lu\n", numElems); + // Setting up data + size_t testSize = 512; + size_t testMemSize = testSize * sizeof(unsigned int); + unsigned int *h_test_in = (unsigned int *)malloc(testMemSize); + unsigned int *h_test_out = (unsigned int *)malloc(testMemSize); + for (unsigned int i = 0; i < testSize; i++) { + h_test_in[i] = 1; + } + // CPU reference result + h_test_out[0] = 0; + for (unsigned int i = 1; i < testSize; i++) { + h_test_out[i] = h_test_in[i - 1] + h_test_out[i - 1]; + } + + // GPU test + unsigned int *d_test_in; + unsigned int *d_test_out; + unsigned int *h_gpu_out; + checkCudaErrors(cudaMalloc((void **)&d_test_in, testMemSize)); + checkCudaErrors(cudaMalloc((void **)&d_test_out, testMemSize)); + h_gpu_out = (unsigned int *)malloc(testMemSize); + + checkCudaErrors( + cudaMemcpy(d_test_in, h_test_in, testMemSize, cudaMemcpyHostToDevice)); + + /*naive_scan<<<1, 8, 8 * sizeof(unsigned int)>>>(d_test_out, d_test_in, + testSize);*/ + + multiBlockScan(d_test_out, d_test_in, testSize); + + checkCudaErrors( + cudaMemcpy(h_gpu_out, d_test_out, testMemSize, cudaMemcpyDeviceToHost)); + + for (unsigned int i = 0; i < testSize; i++) { + unsigned int gpu = h_gpu_out[i]; + unsigned int cpu = h_test_out[i]; + if (gpu != cpu) { + printf("multiblock scan gpu[%u]:%u\t cpu[%u]%u\n", i, gpu, i, cpu); + } + } + + unsigned int *d_intermediate; + checkCudaErrors(cudaMalloc((void **)&d_intermediate, 4 * sizeof(unsigned int))); + //naive_inclusive_scan_per_block<<<4, 2, sizeof(unsigned int) * 2>>> + // (d_test_out, d_intermediate, d_test_in, testSize); + + // Testing reordering + /*map_to_binFlags<<>>(d_binFlags, d_vals_src, + mask, i, j, numElems); + + multiBlockScan(d_binAddress, d_binFlags, numElems); + + offset_by_base<<>>(d_binAddress, d_binScan, + j, numElems); + + reorder<<>>(d_vals_dst, d_vals_src, + d_binFlags, d_binAddress, numElems); + + reorder<<>>(d_pos_dst, d_pos_src, + d_binFlags, d_binAddress, numElems); + + // reset d_binFlags after processing for the 
current + // bin. + checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); + checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); */ + + // cleaning up + free(h_test_in); + free(h_test_out); + free(h_gpu_out); + checkCudaErrors(cudaFree(d_test_in)); + checkCudaErrors(cudaFree(d_test_out)); + + // end of testing area... const int numBits = 4; const int numBins = 1 << numBits; @@ -145,14 +309,19 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, unsigned int *vals_dst = new unsigned int[numElems]; unsigned int *pos_dst = new unsigned int[numElems]; + unsigned int *binFlags = new unsigned int[numElems]; + unsigned int *binAddress = new unsigned int[numElems]; + checkCudaErrors( - cudaMemcpy(vals_src, d_inputVals, memSize, cudaMemcpyDeviceToHost)); + cudaMemcpy(vals_src, d_inputVals, memSize, cudaMemcpyDeviceToHost)); checkCudaErrors( - cudaMemcpy(pos_src, d_inputPos, memSize, cudaMemcpyDeviceToHost)); + cudaMemcpy(pos_src, d_inputPos, memSize, cudaMemcpyDeviceToHost)); // Temporary CPU arrays unsigned int *h_binHistogram = new unsigned int[numBins]; unsigned int *h_binScan = new unsigned int[numBins]; + unsigned int *h_binFlags = new unsigned int[numElems]; + unsigned int *h_binAddress = new unsigned int[numElems]; // GPU arrays unsigned int *d_binHistogram; @@ -160,6 +329,11 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, size_t g_memSize = sizeof(unsigned int) * numBins; checkCudaErrors(cudaMalloc((void **)&d_binHistogram, g_memSize)); checkCudaErrors(cudaMalloc((void **)&d_binScan, g_memSize)); + unsigned int *d_binFlags; // Indicate if a particular value belongs to + // a certain bin + unsigned int *d_binAddress; + checkCudaErrors(cudaMalloc((void **)&d_binFlags, memSize)); + checkCudaErrors(cudaMalloc((void **)&d_binAddress, memSize)); // GPU temporary pointers unsigned int *d_vals_src = d_inputVals; @@ -177,77 +351,165 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const 
d_inputPos, // multiples of 2 // main loop for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) { - unsigned int mask = (numBins - 1) << i; - - memset(binHistogram, 0, - sizeof(unsigned int) * numBins); // zero out the bins - memset(binScan, 0, sizeof(unsigned int) * numBins); // zero out the - // bins - - // Reset GPU arrays - checkCudaErrors(cudaMemset(d_binHistogram, 0, g_memSize)); - checkCudaErrors(cudaMemset(d_binScan, 0, g_memSize)); - - // CPU histogram - // perform histogram of data & mask into bins - for (unsigned int j = 0; j < numElems; ++j) { - unsigned int bin = (vals_src[j] & mask) >> i; - binHistogram[bin]++; - } - - // GPU histogram - simple_hist<<>>(d_binHistogram, d_vals_src, - numElems, mask, i); - checkCudaErrors(cudaMemcpy(h_binHistogram, d_binHistogram, g_memSize, - cudaMemcpyDeviceToHost)); - // check GPU results - for (unsigned int j = 0; j < numBins; j++) { - if (h_binHistogram[j] != binHistogram[j]) { - printf("GPU hist[%d]:%u\tCPU hist[%d]:%u\n", j, - h_binHistogram[j], j, binHistogram[j]); - } - } - - // CPU scan - // perform exclusive prefix sum (scan) on binHistogram to get starting - // location for each bin - for (unsigned int j = 1; j < numBins; ++j) { - binScan[j] = binScan[j - 1] + binHistogram[j - 1]; - } - - // GPU scan - /*__global__ void naive_scan(unsigned int* const d_out, - const unsigned int* const d_in, - const size_t size)*/ - naive_scan<<<1, numBins, numBins * sizeof(unsigned int)>>>(d_binScan, d_binHistogram, numBins); - - checkCudaErrors(cudaMemcpy(h_binScan, d_binScan, numBins * sizeof(unsigned int), - cudaMemcpyDeviceToHost)); - // check GPU results - for (unsigned int j = 0; j < numBins; j++) { - if (h_binScan[j] != binScan[j]) { - printf("GPU scan[%d]:%u\tCPU scan[%d]:%u\n", j, - h_binHistogram[j], j, binHistogram[j]); - } - } - - // Gather everything into the correct location - // need to move vals and positions - for (unsigned int j = 0; j < numElems; ++j) { - unsigned int bin = (vals_src[j] & 
mask) >> i; - vals_dst[binScan[bin]] = vals_src[j]; - pos_dst[binScan[bin]] = pos_src[j]; - binScan[bin]++; - } - - // swap the buffers (pointers only) - std::swap(vals_dst, vals_src); - std::swap(pos_dst, pos_src); + printf("XXXXXXXXXX mask at pos %u XXXXXXXXXXX\n", i); + unsigned int mask = (numBins - 1) << i; + + memset(binHistogram, 0, + sizeof(unsigned int) * numBins); // zero out the bins + memset(binScan, 0, sizeof(unsigned int) * numBins); // zero out the + // bins + memset(binFlags, 0, memSize); + memset(binAddress, 0, memSize); + + // Reset GPU arrays + checkCudaErrors(cudaMemset(d_binHistogram, 0, g_memSize)); + checkCudaErrors(cudaMemset(d_binScan, 0, g_memSize)); + checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); + checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); + + // CPU histogram + // perform histogram of data & mask into bins + for (unsigned int j = 0; j < numElems; ++j) { + unsigned int bin = (vals_src[j] & mask) >> i; + binHistogram[bin]++; + } + + // GPU histogram + simple_hist<<>>(d_binHistogram, d_vals_src, + numElems, mask, i); + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(h_binHistogram, d_binHistogram, g_memSize, + cudaMemcpyDeviceToHost)); + // check GPU results + for (unsigned int j = 0; j < numBins; j++) { + if (h_binHistogram[j] != binHistogram[j]) { + printf("GPU hist[%d]:%u\tCPU hist[%d]:%u\n", j, + h_binHistogram[j], j, binHistogram[j]); + } + } + + // CPU scan + // perform exclusive prefix sum (scan) on binHistogram to get starting + // location for each bin + for (unsigned int j = 1; j < numBins; ++j) { + binScan[j] = binScan[j - 1] + binHistogram[j - 1]; + } + + // GPU scan + naive_scan<<<1, numBins, numBins * sizeof(unsigned int)>>>( + d_binScan, d_binHistogram, numBins); + + cudaDeviceSynchronize(); + checkCudaErrors(cudaMemcpy(h_binScan, d_binScan, + numBins * sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + // check GPU results + for (unsigned int j = 0; j < numBins; j++) { + if (h_binScan[j] != 
binScan[j]) { + printf("GPU scan[%d]:%u\tCPU scan[%d]:%u\n", j, + h_binHistogram[j], j, binHistogram[j]); + } + } + + // Gather everything into the correct location + // need to move vals and positions + for (unsigned int j = 0; j < numElems; ++j) { + unsigned int bin = (vals_src[j] & mask) >> i; + vals_dst[binScan[bin]] = vals_src[j]; + pos_dst[binScan[bin]] = pos_src[j]; + binScan[bin]++; + } + + // GPU computation to remap the elements + for (unsigned int j = 0; j < numBins; ++j) { + map_to_binFlags<<>>(d_binFlags, d_vals_src, + mask, i, j, numElems); + + cudaDeviceSynchronize(); + for (unsigned k = 0; k < numElems; k++) { + unsigned int bin = ((vals_src[k] & mask) >> i); + binFlags[k] = (bin == j)? 1 : 0; + } + + cudaMemcpy(h_binFlags, d_binFlags, memSize, cudaMemcpyDeviceToHost); + + for (unsigned k = 0; k < numElems; k++) { + unsigned int gpu = h_binFlags[k]; + unsigned int cpu = binFlags[k]; + + if (gpu != cpu) { + printf("gpu bin flag[%u]:%u\tcpu bin flag[%u]:%u\n", + k, gpu, k, cpu); + exit(1); + } + } + + multiBlockScan(d_binAddress, d_binFlags, numElems); + + binAddress[0] = 0; + for (unsigned k = 1; k < numElems; k++) { + binAddress[k] = binAddress[k - 1] + binFlags[k - 1]; + } + + cudaMemcpy(h_binAddress, d_binAddress, memSize, cudaMemcpyDeviceToHost); + + for (unsigned k = 0; k < numElems; k++) { + unsigned int gpu = h_binAddress[k]; + unsigned int cpu = binAddress[k]; + + if (gpu != cpu) { + printf("gpu bin addr[%u]:%u\tcpu bin addr[%u]:%u\n", + k, gpu, k, cpu); + exit(1); + } + } + + + offset_by_base<<>>(d_binAddress, d_binScan, + j, numElems); + + reorder<<>>(d_vals_dst, d_vals_src, + d_binFlags, d_binAddress, numElems); + + reorder<<>>(d_pos_dst, d_pos_src, + d_binFlags, d_binAddress, numElems); + + // reset d_binFlags after processing for the current + // bin. 
+ memset(binFlags, 0, memSize); + memset(binAddress, 0, memSize); + checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); + checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); + } + + unsigned int *h_vals_dst = (unsigned int*)malloc(memSize); + checkCudaErrors(cudaMemcpy(h_vals_dst, d_vals_dst, + memSize, cudaMemcpyDeviceToHost)); + + for (size_t k = 0; k < numElems; k++) { + unsigned int gpu = h_vals_dst[k]; + unsigned int cpu = vals_dst[k]; + if (gpu != cpu) { + printf("gpu vals_dst[%lu]:%u\tcpu vals_dst[%lu]:%u\n", + k, gpu, k, cpu); + exit(1); + } + } + + // swap the buffers (pointers only) + std::swap(vals_dst, vals_src); + std::swap(pos_dst, pos_src); + + // swap device pointers + swap_device_ptr(&d_vals_dst, &d_vals_src); + swap_device_ptr(&d_pos_dst, &d_pos_src); } // we did an even number of iterations, need to copy from input buffer into // output std::copy(inputVals, inputVals + numElems, outputVals); // std::copy(inputPos, inputPos + numElems, outputPos); + checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, memSize, cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, memSize, cudaMemcpyDeviceToDevice)); delete[] binHistogram; delete[] binScan; From 7c7e73e7d8bdf5ffb27989b6eab0508198ae4cad Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 13 Aug 2018 12:46:57 -0400 Subject: [PATCH 13/15] PS4 finishing up --- CMakeLists.txt | 2 +- Problem Sets/Problem Set 4/student_func.cu | 218 +-------------------- 2 files changed, 5 insertions(+), 215 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2627178a..e1681089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ if(CUDA_FOUND) # to make sure more people can easily run class code without knowing # about this compiler argument set(CUDA_NVCC_FLAGS " - -ccbin /usr/bin/clang; + -ccbin /usr/bin/gcc; -gencode;arch=compute_30,code=sm_30; -gencode;arch=compute_35,code=sm_35; -gencode;arch=compute_35,code=compute_35;") diff --git a/Problem Sets/Problem 
Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 2181c341..4ea5eaab 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -217,112 +217,11 @@ void swap_device_ptr(unsigned int **d1, unsigned int **d2) { void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, unsigned int *const d_outputVals, unsigned int *const d_outputPos, const size_t numElems) { - // Temporary testing area... - printf("numElems %lu\n", numElems); - // Setting up data - size_t testSize = 512; - size_t testMemSize = testSize * sizeof(unsigned int); - unsigned int *h_test_in = (unsigned int *)malloc(testMemSize); - unsigned int *h_test_out = (unsigned int *)malloc(testMemSize); - for (unsigned int i = 0; i < testSize; i++) { - h_test_in[i] = 1; - } - // CPU reference result - h_test_out[0] = 0; - for (unsigned int i = 1; i < testSize; i++) { - h_test_out[i] = h_test_in[i - 1] + h_test_out[i - 1]; - } - - // GPU test - unsigned int *d_test_in; - unsigned int *d_test_out; - unsigned int *h_gpu_out; - checkCudaErrors(cudaMalloc((void **)&d_test_in, testMemSize)); - checkCudaErrors(cudaMalloc((void **)&d_test_out, testMemSize)); - h_gpu_out = (unsigned int *)malloc(testMemSize); - - checkCudaErrors( - cudaMemcpy(d_test_in, h_test_in, testMemSize, cudaMemcpyHostToDevice)); - - /*naive_scan<<<1, 8, 8 * sizeof(unsigned int)>>>(d_test_out, d_test_in, - testSize);*/ - - multiBlockScan(d_test_out, d_test_in, testSize); - - checkCudaErrors( - cudaMemcpy(h_gpu_out, d_test_out, testMemSize, cudaMemcpyDeviceToHost)); - - for (unsigned int i = 0; i < testSize; i++) { - unsigned int gpu = h_gpu_out[i]; - unsigned int cpu = h_test_out[i]; - if (gpu != cpu) { - printf("multiblock scan gpu[%u]:%u\t cpu[%u]%u\n", i, gpu, i, cpu); - } - } - - unsigned int *d_intermediate; - checkCudaErrors(cudaMalloc((void **)&d_intermediate, 4 * sizeof(unsigned int))); - //naive_inclusive_scan_per_block<<<4, 2, sizeof(unsigned int) * 
2>>> - // (d_test_out, d_intermediate, d_test_in, testSize); - - // Testing reordering - /*map_to_binFlags<<>>(d_binFlags, d_vals_src, - mask, i, j, numElems); - - multiBlockScan(d_binAddress, d_binFlags, numElems); - - offset_by_base<<>>(d_binAddress, d_binScan, - j, numElems); - - reorder<<>>(d_vals_dst, d_vals_src, - d_binFlags, d_binAddress, numElems); - - reorder<<>>(d_pos_dst, d_pos_src, - d_binFlags, d_binAddress, numElems); - - // reset d_binFlags after processing for the current - // bin. - checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); - checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); */ - - // cleaning up - free(h_test_in); - free(h_test_out); - free(h_gpu_out); - checkCudaErrors(cudaFree(d_test_in)); - checkCudaErrors(cudaFree(d_test_out)); - - // end of testing area... - - const int numBits = 4; + const int numBits = 2; const int numBins = 1 << numBits; size_t memSize = sizeof(unsigned int) * numElems; - // CPU code for testing - unsigned int *binHistogram = new unsigned int[numBins]; - unsigned int *binScan = new unsigned int[numBins]; - - unsigned int *vals_src = new unsigned int[numElems]; - unsigned int *pos_src = new unsigned int[numElems]; - - unsigned int *vals_dst = new unsigned int[numElems]; - unsigned int *pos_dst = new unsigned int[numElems]; - - unsigned int *binFlags = new unsigned int[numElems]; - unsigned int *binAddress = new unsigned int[numElems]; - - checkCudaErrors( - cudaMemcpy(vals_src, d_inputVals, memSize, cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(pos_src, d_inputPos, memSize, cudaMemcpyDeviceToHost)); - - // Temporary CPU arrays - unsigned int *h_binHistogram = new unsigned int[numBins]; - unsigned int *h_binScan = new unsigned int[numBins]; - unsigned int *h_binFlags = new unsigned int[numElems]; - unsigned int *h_binAddress = new unsigned int[numElems]; - // GPU arrays unsigned int *d_binHistogram; unsigned int *d_binScan; @@ -351,120 +250,28 @@ void your_sort(unsigned int *const d_inputVals, 
unsigned int *const d_inputPos, // multiples of 2 // main loop for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) { - printf("XXXXXXXXXX mask at pos %u XXXXXXXXXXX\n", i); unsigned int mask = (numBins - 1) << i; - memset(binHistogram, 0, - sizeof(unsigned int) * numBins); // zero out the bins - memset(binScan, 0, sizeof(unsigned int) * numBins); // zero out the - // bins - memset(binFlags, 0, memSize); - memset(binAddress, 0, memSize); - // Reset GPU arrays checkCudaErrors(cudaMemset(d_binHistogram, 0, g_memSize)); checkCudaErrors(cudaMemset(d_binScan, 0, g_memSize)); checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); - // CPU histogram - // perform histogram of data & mask into bins - for (unsigned int j = 0; j < numElems; ++j) { - unsigned int bin = (vals_src[j] & mask) >> i; - binHistogram[bin]++; - } - // GPU histogram simple_hist<<>>(d_binHistogram, d_vals_src, numElems, mask, i); - cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(h_binHistogram, d_binHistogram, g_memSize, - cudaMemcpyDeviceToHost)); - // check GPU results - for (unsigned int j = 0; j < numBins; j++) { - if (h_binHistogram[j] != binHistogram[j]) { - printf("GPU hist[%d]:%u\tCPU hist[%d]:%u\n", j, - h_binHistogram[j], j, binHistogram[j]); - } - } - - // CPU scan - // perform exclusive prefix sum (scan) on binHistogram to get starting - // location for each bin - for (unsigned int j = 1; j < numBins; ++j) { - binScan[j] = binScan[j - 1] + binHistogram[j - 1]; - } // GPU scan naive_scan<<<1, numBins, numBins * sizeof(unsigned int)>>>( d_binScan, d_binHistogram, numBins); - cudaDeviceSynchronize(); - checkCudaErrors(cudaMemcpy(h_binScan, d_binScan, - numBins * sizeof(unsigned int), - cudaMemcpyDeviceToHost)); - // check GPU results - for (unsigned int j = 0; j < numBins; j++) { - if (h_binScan[j] != binScan[j]) { - printf("GPU scan[%d]:%u\tCPU scan[%d]:%u\n", j, - h_binHistogram[j], j, binHistogram[j]); - } - } - - // 
Gather everything into the correct location - // need to move vals and positions - for (unsigned int j = 0; j < numElems; ++j) { - unsigned int bin = (vals_src[j] & mask) >> i; - vals_dst[binScan[bin]] = vals_src[j]; - pos_dst[binScan[bin]] = pos_src[j]; - binScan[bin]++; - } - // GPU computation to remap the elements for (unsigned int j = 0; j < numBins; ++j) { map_to_binFlags<<>>(d_binFlags, d_vals_src, mask, i, j, numElems); - - cudaDeviceSynchronize(); - for (unsigned k = 0; k < numElems; k++) { - unsigned int bin = ((vals_src[k] & mask) >> i); - binFlags[k] = (bin == j)? 1 : 0; - } - - cudaMemcpy(h_binFlags, d_binFlags, memSize, cudaMemcpyDeviceToHost); - - for (unsigned k = 0; k < numElems; k++) { - unsigned int gpu = h_binFlags[k]; - unsigned int cpu = binFlags[k]; - - if (gpu != cpu) { - printf("gpu bin flag[%u]:%u\tcpu bin flag[%u]:%u\n", - k, gpu, k, cpu); - exit(1); - } - } - multiBlockScan(d_binAddress, d_binFlags, numElems); - binAddress[0] = 0; - for (unsigned k = 1; k < numElems; k++) { - binAddress[k] = binAddress[k - 1] + binFlags[k - 1]; - } - - cudaMemcpy(h_binAddress, d_binAddress, memSize, cudaMemcpyDeviceToHost); - - for (unsigned k = 0; k < numElems; k++) { - unsigned int gpu = h_binAddress[k]; - unsigned int cpu = binAddress[k]; - - if (gpu != cpu) { - printf("gpu bin addr[%u]:%u\tcpu bin addr[%u]:%u\n", - k, gpu, k, cpu); - exit(1); - } - } - - offset_by_base<<>>(d_binAddress, d_binScan, j, numElems); @@ -476,8 +283,6 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, // reset d_binFlags after processing for the current // bin. 
- memset(binFlags, 0, memSize); - memset(binAddress, 0, memSize); checkCudaErrors(cudaMemset(d_binFlags, 0, memSize)); checkCudaErrors(cudaMemset(d_binAddress, 0, memSize)); } @@ -486,31 +291,16 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, checkCudaErrors(cudaMemcpy(h_vals_dst, d_vals_dst, memSize, cudaMemcpyDeviceToHost)); - for (size_t k = 0; k < numElems; k++) { - unsigned int gpu = h_vals_dst[k]; - unsigned int cpu = vals_dst[k]; - if (gpu != cpu) { - printf("gpu vals_dst[%lu]:%u\tcpu vals_dst[%lu]:%u\n", - k, gpu, k, cpu); - exit(1); - } - } - - // swap the buffers (pointers only) - std::swap(vals_dst, vals_src); - std::swap(pos_dst, pos_src); - // swap device pointers swap_device_ptr(&d_vals_dst, &d_vals_src); swap_device_ptr(&d_pos_dst, &d_pos_src); } // we did an even number of iterations, need to copy from input buffer into - // output std::copy(inputVals, inputVals + numElems, outputVals); - // std::copy(inputPos, inputPos + numElems, outputPos); + // output checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, memSize, cudaMemcpyDeviceToDevice)); checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, memSize, cudaMemcpyDeviceToDevice)); - delete[] binHistogram; - delete[] binScan; + checkCudaErrors(cudaFree(d_binHistogram)); + checkCudaErrors(cudaFree(d_binScan)); } From 331415b215b6e6179c651f1690ce5fa3459532a2 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 13 Aug 2018 12:49:33 -0400 Subject: [PATCH 14/15] PS4 need to free some gpu memory --- Problem Sets/Problem Set 4/student_func.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 4ea5eaab..33f3dc87 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -303,4 +303,6 @@ void your_sort(unsigned int *const d_inputVals, unsigned int *const d_inputPos, checkCudaErrors(cudaFree(d_binHistogram)); 
checkCudaErrors(cudaFree(d_binScan)); + checkCudaErrors(cudaFree(d_binFlags)); + checkCudaErrors(cudaFree(d_binAddress)); } From bdb02f3692caf480c8da2620089b03de13838af9 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu Date: Mon, 13 Aug 2018 14:15:54 -0400 Subject: [PATCH 15/15] PS4 fixing potential barrier issue --- Problem Sets/Problem Set 4/student_func.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 33f3dc87..a479c82f 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -63,6 +63,7 @@ __global__ void naive_scan(unsigned int *const d_out, extern __shared__ unsigned int sdata[]; int tid = threadIdx.x; sdata[tid] = (tid >= 1 && tid < size) ? d_in[tid - 1] : 0; + __syncthreads(); for (unsigned int s = 1; s < blockDim.x; s <<= 1) { unsigned int a = 0;