CIS565-Fall-2014 · dblsai · Sep 29, 2014 · Sep 29, 2014 · Sep 29, 2014 · Sep 29, 2014
diff --git a/README.md b/README.md
@@ -88,6 +88,12 @@ you are NOT allowed to use shared memory.
   include a table of how the runtimes compare on different lengths of arrays.
 * Plot a graph of the comparison and write a short explanation of the phenomenon you
   see here.
+* I could not find a high-resolution CPU clock in Visual Studio, though I tried several
+  options, such as "chrono", which is not supported by VS2010. As a result, I couldn't get
+  CPU timer readings with a resolution finer than 1 ms.
+  What I do observe for the GPU version is that computation time increases as the array
+  size grows. I think the same applies to the CPU version, but I am not sure which one performs better.
+
 
 # PART 3 : OPTIMIZING PREFIX SUM
 In the previous section we did not take into account shared memory.  In the
@@ -106,6 +112,13 @@ to arbitrary length arrays, this includes arrays that will not fit on one block.
 * Compare this version to the parallel prefix sum using global memory.
 * Plot a graph of the comparison and write a short explanation of the phenomenon
   you see here.
+![alt tag](/imgs/SM_G_fixedN.png)
+When the array size is fixed, speed increases as the block size gets larger. However, a big
+block size is not always good: performance decreases once the block size exceeds 128.
+
+![alt tag](/imgs/SM_G_fixedB.png)
+When the block size is fixed, speed decreases as the array size gets larger. The shared memory
+version always does a better job than the global memory version.
 
 # PART 4 : ADDING SCATTER
 First create a serial version of scatter by expanding the serial version of
@@ -117,6 +130,9 @@ array for you.  Finally, write a version using thrust.
 * Compare your version of stream compact to your version using thrust.  How do
   they compare?  How might you optimize yours more, or how might thrust's stream
   compact be optimized.
+![alt tag](/imgs/T_SM_fixedB.png)
+Thrust always performs at a constant speed, no matter how large the array is.
+Things to optimize in my version: work-efficient code and handling of shared memory bank conflicts.
 
 # EXTRA CREDIT (+10)
 For extra credit, please optimize your prefix sum for work parallelism and to

diff --git a/StreamCompaction/Debug/StreamCompaction.ilk b/StreamCompaction/Debug/StreamCompaction.ilk
diff --git a/StreamCompaction/Debug/StreamCompaction.pdb b/StreamCompaction/Debug/StreamCompaction.pdb
diff --git a/StreamCompaction/StreamCompaction.sdf b/StreamCompaction/StreamCompaction.sdf
diff --git a/StreamCompaction/StreamCompaction.sln b/StreamCompaction/StreamCompaction.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamCompaction", "StreamCompaction\StreamCompaction.vcxproj", "{F7A0F875-4831-4262-BE9F-038D944DB786}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{F7A0F875-4831-4262-BE9F-038D944DB786}.Debug|Win32.ActiveCfg = Debug|Win32
+		{F7A0F875-4831-4262-BE9F-038D944DB786}.Debug|Win32.Build.0 = Debug|Win32
+		{F7A0F875-4831-4262-BE9F-038D944DB786}.Release|Win32.ActiveCfg = Release|Win32
+		{F7A0F875-4831-4262-BE9F-038D944DB786}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/StreamCompaction/StreamCompaction.suo b/StreamCompaction/StreamCompaction.suo
diff --git a/StreamCompaction/StreamCompaction.v12.suo b/StreamCompaction/StreamCompaction.v12.suo
diff --git a/StreamCompaction/StreamCompaction/CPUSequential.txt b/StreamCompaction/StreamCompaction/CPUSequential.txt
@@ -0,0 +1,153 @@
+-------------------CPUSequential.txt-------------------
+------------------- Output array -------------------
+1
+3
+5
+7
+9
+11
+13
+15
+17
+19
+21
+23
+25
+27
+29
+31
+33
+35
+37
+39
+41
+43
+45
+47
+49
+51
+53
+55
+57
+59
+61
+63
+65
+67
+69
+71
+73
+75
+77
+79
+81
+83
+85
+87
+89
+91
+93
+95
+97
+99
+101
+103
+105
+107
+109
+111
+113
+115
+117
+119
+121
+123
+125
+127
+129
+131
+133
+135
+137
+139
+141
+143
+145
+147
+149
+151
+153
+155
+157
+159
+161
+163
+165
+167
+169
+171
+173
+175
+177
+179
+181
+183
+185
+187
+189
+191
+193
+195
+197
+199
+201
+203
+205
+207
+209
+211
+213
+215
+217
+219
+221
+223
+225
+227
+229
+231
+233
+235
+237
+239
+241
+243
+245
+247
+249
+251
+253
+255
+257
+259
+261
+263
+265
+267
+269
+271
+273
+275
+277
+279
+281
+283
+285
+287
+289
+291
+293
+295
+297
+299
+
diff --git a/StreamCompaction/StreamCompaction/Debug/CL.read.1.tlog b/StreamCompaction/StreamCompaction/Debug/CL.read.1.tlog
diff --git a/StreamCompaction/StreamCompaction/Debug/CL.write.1.tlog b/StreamCompaction/StreamCompaction/Debug/CL.write.1.tlog
diff --git a/StreamCompaction/StreamCompaction/Debug/ParallelReduction.cu.cache b/StreamCompaction/StreamCompaction/Debug/ParallelReduction.cu.cache
@@ -0,0 +1,49 @@
+Identity=ParallelReduction.cu
+AdditionalCompilerOptions=
+AdditionalCompilerOptions=
+AdditionalDependencies=
+AdditionalDeps=
+AdditionalLibraryDirectories=
+AdditionalOptions=
+AdditionalOptions=
+CInterleavedPTX=false
+CodeGeneration=compute_20,sm_20
+CodeGeneration=compute_20,sm_20
+CompileOut=Debug\ParallelReduction.cu.obj
+CudaRuntime=Static
+CudaToolkitCustomDir=
+Defines=;WIN32;_DEBUG;_CONSOLE;_UNICODE;UNICODE;
+Emulation=false
+FastMath=false
+GenerateLineInfo=false
+GenerateRelocatableDeviceCode=false
+GPUDebugInfo=true
+GPUDebugInfo=true
+HostDebugInfo=true
+Include=;;Q:\CUDA5.5\include
+Inputs=
+Keep=false
+KeepDir=Debug
+LinkOut=
+MaxRegCount=0
+NvccCompilation=compile
+NvccPath=
+Optimization=Od
+Optimization=Od
+PerformDeviceLink=
+PtxAsOptionV=false
+RequiredIncludes=
+Runtime=MDd
+Runtime=MDd
+RuntimeChecks=RTC1
+RuntimeChecks=RTC1
+TargetMachinePlatform=32
+TargetMachinePlatform=32
+TypeInfo=
+TypeInfo=
+UseHostDefines=true
+UseHostInclude=true
+UseHostLibraryDependencies=
+UseHostLibraryDirectories=
+Warning=W3
+Warning=W3
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,153 @@
		-------------------CPUSequential.txt-------------------
		------------------- Output array -------------------
		1
		3
		5
		7
		9
		11
		13
		15
		17
		19
		21
		23
		25
		27
		29
		31
		33
		35
		37
		39
		41
		43
		45
		47
		49
		51
		53
		55
		57
		59
		61
		63
		65
		67
		69
		71
		73
		75
		77
		79
		81
		83
		85
		87
		89
		91
		93
		95
		97
		99
		101
		103
		105
		107
		109
		111
		113
		115
		117
		119
		121
		123
		125
		127
		129
		131
		133
		135
		137
		139
		141
		143
		145
		147
		149
		151
		153
		155
		157
		159
		161
		163
		165
		167
		169
		171
		173
		175
		177
		179
		181
		183
		185
		187
		189
		191
		193
		195
		197
		199
		201
		203
		205
		207
		209
		211
		213
		215
		217
		219
		221
		223
		225
		227
		229
		231
		233
		235
		237
		239
		241
		243
		245
		247
		249
		251
		253
		255
		257
		259
		261
		263
		265
		267
		269
		271
		273
		275
		277
		279
		281
		283
		285
		287
		289
		291
		293
		295
		297
		299