Skip to content

Commit 3ad528e

Browse files
committed
Cleaned up debugging messages
Added Run() method for batched context for managed runs. Added a final Synchronize() call to CUContext.Unlock(). Added checks to ensure BatchedContext and Ctx are Contexts. Fixed tests to conform to the new paradigm.
1 parent e530e8c commit 3ad528e

13 files changed

+231
-59
lines changed

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ The work to fully represent the CUDA Driver API is a work in progress. At the mo
4646

4747
## Roadmap ##
4848

49-
* [ ] All texture, surface and graphics related API have an equivalent Go prototype.
49+
* [ ] Remaining API to be ported over
50+
* [x] All texture, surface and graphics related API have an equivalent Go prototype.
5051
* [x] Batching of common operations (see for example `Device.Attributes(...)`
5152
* [x] Generic queueing/batching of API calls (by some definition of generic)
5253

api.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ func MemFreeHost(p unsafe.Pointer) (err error) {
224224
return result(C.cuMemFreeHost(Cp))
225225
}
226226

227-
func MemAllocManaged(bytesize int64, flags uint) (dptr DevicePtr, err error) {
227+
func MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error) {
228228
Cbytesize := C.size_t(bytesize)
229229
Cflags := C.uint(flags)
230230
var Cdptr C.CUdeviceptr

batch.go

+24-15
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,6 @@ func (ctx *BatchedContext) WorkAvailable() <-chan struct{} { return ctx.workAvai
195195
// DoWork waits for work to come in from the queue. If it's blocking, the entire queue will be processed immediately.
196196
// Otherwise it will be added to the batch queue.
197197
func (ctx *BatchedContext) DoWork() {
198-
// ctx.Lock()
199-
// defer ctx.Unlock()
200198
for {
201199
select {
202200
case w := <-ctx.work:
@@ -225,9 +223,7 @@ func (ctx *BatchedContext) DoWork() {
225223
}
226224

227225
// debug and instrumentation related stuff
228-
logf("GOING TO PROCESS")
229-
pc, _, _, _ := runtime.Caller(1)
230-
logf("Called by %v", runtime.FuncForPC(pc).Name())
226+
logCaller("DoWork()")
231227
logf(ctx.introspect())
232228
addQueueLength(len(ctx.queue))
233229
addBlockingCallers()
@@ -267,6 +263,29 @@ func (ctx *BatchedContext) DoWork() {
267263
}
268264
}
269265

266+
// Run manages the running of the BatchedContext. Because it is expected to run in a goroutine, an error channel should be passed in.
267+
func (ctx *BatchedContext) Run(errChan chan error) error {
268+
runtime.LockOSThread()
269+
for {
270+
select {
271+
case <-ctx.workAvailable:
272+
ctx.DoWork()
273+
if err := ctx.Errors(); err != nil {
274+
if errChan == nil {
275+
runtime.UnlockOSThread()
276+
return err
277+
}
278+
errChan <- err
279+
280+
}
281+
case w := <-ctx.Work():
282+
ctx.ErrChan() <- w()
283+
}
284+
}
285+
runtime.UnlockOSThread()
286+
return nil
287+
}
288+
270289
// Cleanup is the cleanup function. It cleans up all the ancillary allocations that have happened for all the batched calls.
271290
// This method should be called when the context is done with - otherwise there'd be a lot of leaked memory.
272291
//
@@ -323,9 +342,6 @@ func (ctx *BatchedContext) MemAllocManaged(bytesize int64, flags MemAttachFlags)
323342
}
324343

325344
func (ctx *BatchedContext) Memcpy(dst, src DevicePtr, byteCount int64) {
326-
// pc, _, _, _ := runtime.Caller(1)
327-
// logf("Memcpy %v %v| called by %v", dst, src, runtime.FuncForPC(pc).Name())
328-
329345
fn := &fnargs{
330346
fn: C.fn_memcpy,
331347
devptr0: C.CUdeviceptr(dst),
@@ -337,8 +353,6 @@ func (ctx *BatchedContext) Memcpy(dst, src DevicePtr, byteCount int64) {
337353
}
338354

339355
func (ctx *BatchedContext) MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, byteCount int64) {
340-
// logf("Memcpy H2D: 0x%v, %v", dst, src)
341-
// log.Printf("Memcpy H2D: 0x%v, %v", dst, src)
342356
fn := &fnargs{
343357
fn: C.fn_memcpyHtoD,
344358
devptr0: C.CUdeviceptr(dst),
@@ -350,9 +364,6 @@ func (ctx *BatchedContext) MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, byteCou
350364
}
351365

352366
func (ctx *BatchedContext) MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, byteCount int64) {
353-
// pc, _, _, _ := runtime.Caller(2)
354-
// log.Printf("MemcpyD2H %v %v| called by %v", dst, src, runtime.FuncForPC(pc).Name())
355-
// logf("Memcpy D2H: %v 0x%v", dst, src)
356367
fn := &fnargs{
357368
fn: C.fn_memcpyDtoH,
358369
devptr0: C.CUdeviceptr(src),
@@ -364,8 +375,6 @@ func (ctx *BatchedContext) MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, byteCou
364375
}
365376

366377
func (ctx *BatchedContext) MemFree(mem DevicePtr) {
367-
// pc, _, _, _ := runtime.Caller(1)
368-
// logf("MEMFREE %v CALLED BY %v", mem, runtime.FuncForPC(pc).Name())
369378
fn := &fnargs{
370379
fn: C.fn_memfreeD,
371380
devptr0: C.CUdeviceptr(mem),

batch_test.go

+14-8
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,11 @@ loop:
8181
bctx.DoWork()
8282
case <-doneChan:
8383
break loop
84-
default:
8584
}
8685
}
86+
if err = Synchronize(); err != nil {
87+
t.Errorf("Failed to Sync %v", err)
88+
}
8789

8890
for _, v := range a {
8991
if v != float32(2) {
@@ -93,7 +95,7 @@ loop:
9395
}
9496

9597
Unload(mod)
96-
// DestroyContext(&ctx)
98+
DestroyContext(&cuctx)
9799
}
98100

99101
func TestLargeBatch(t *testing.T) {
@@ -138,9 +140,9 @@ func TestLargeBatch(t *testing.T) {
138140
}
139141
size := int64(len(a) * 4)
140142

141-
var frees []DevicePtr
142143
go func() {
143144
var memA, memB DevicePtr
145+
var frees []DevicePtr
144146

145147
for i := 0; i < 104729; i++ {
146148
if memA, err = bctx.AllocAndCopy(unsafe.Pointer(&a[0]), size); err != nil {
@@ -173,6 +175,7 @@ func TestLargeBatch(t *testing.T) {
173175

174176
bctx.MemcpyDtoH(unsafe.Pointer(&a[0]), memA, size)
175177
bctx.MemcpyDtoH(unsafe.Pointer(&b[0]), memB, size)
178+
log.Printf("Number of frees %v", len(frees))
176179
for _, free := range frees {
177180
bctx.MemFree(free)
178181
}
@@ -191,6 +194,11 @@ loop:
191194
}
192195
}
193196

197+
bctx.DoWork()
198+
if err = Synchronize(); err != nil {
199+
t.Errorf("Failed to Sync %v", err)
200+
}
201+
194202
for _, v := range a {
195203
if v != float32(2) {
196204
t.Errorf("Expected all values to be 2. %v", a)
@@ -201,12 +209,10 @@ loop:
201209
afterFree, _, _ := MemInfo()
202210

203211
if afterFree != beforeFree {
204-
t.Errorf("Before: Freemem: %v. After %v", beforeFree, afterFree)
212+
t.Errorf("Before: Freemem: %v. After %v | Diff %v", beforeFree, afterFree, (beforeFree-afterFree)/1024)
205213
}
206-
207214
Unload(mod)
208-
// DestroyContext(&ctx)
209-
215+
DestroyContext(&cuctx)
210216
}
211217

212218
func BenchmarkNoBatching(bench *testing.B) {
@@ -359,6 +365,6 @@ func BenchmarkBatching(bench *testing.B) {
359365
MemFree(memA)
360366
MemFree(memB)
361367
Unload(mod)
362-
// DestroyContext(&ctx)
368+
DestroyContext(&cuctx)
363369

364370
}

cmd/genlib/README.md

+3
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,8 @@ The first line preprocesses all the macros, leaving a singular header file. The
5050
* `CurrentContext` - deleted
5151
* `CurrentDevice`
5252
* `CurrentFlags`
53+
* `CanAccessPeer` - deleted
54+
* `P2PAttribute` - deleted
55+
* `MemAllocManaged`
5356

5457
## Ctx related methods - manually written ##

context.go

+8-10
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@ package cu
22

33
// #include <cuda.h>
44
import "C"
5-
import (
6-
"sync"
7-
"unsafe"
8-
)
5+
import "unsafe"
96

10-
var contextLock = new(sync.Mutex)
11-
var pkgContext CUContext
7+
var (
8+
_ Context = &Ctx{}
9+
_ Context = &BatchedContext{}
10+
)
1211

1312
// Context interface. Typically you'd just embed *Ctx. Rarely do you need to use CUContext
1413
type Context interface {
@@ -17,15 +16,15 @@ type Context interface {
1716
Error() error
1817
Run(chan error) error
1918
Do(fn func() error) error
20-
Work() chan func() error
19+
Work() <-chan func() error
20+
ErrChan() chan<- error
2121

2222
// actual methods
2323
Address(hTexRef TexRef) (pdptr DevicePtr, err error)
2424
AddressMode(hTexRef TexRef, dim int) (pam AddressMode, err error)
2525
Array(hTexRef TexRef) (phArray Array, err error)
2626
AttachMemAsync(hStream Stream, dptr DevicePtr, length int64, flags uint)
2727
BorderColor(hTexRef TexRef) (pBorderColor [3]float32, err error)
28-
CanAccessPeer(dev Device, peerDev Device) (canAccessPeer int, err error)
2928
CurrentCacheConfig() (pconfig FuncCacheConfig, err error)
3029
CurrentDevice() (device Device, err error)
3130
CurrentFlags() (flags ContextFlags, err error)
@@ -49,7 +48,7 @@ type Context interface {
4948
MakeStreamWithPriority(priority int, flags StreamFlags) (stream Stream, err error)
5049
MaxAnisotropy(hTexRef TexRef) (pmaxAniso int, err error)
5150
MemAlloc(bytesize int64) (dptr DevicePtr, err error)
52-
MemAllocManaged(bytesize int64, flags uint) (dptr DevicePtr, err error)
51+
MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error)
5352
MemAllocPitch(WidthInBytes int64, Height int64, ElementSizeBytes uint) (dptr DevicePtr, pPitch int64, err error)
5453
MemFree(dptr DevicePtr)
5554
MemFreeHost(p unsafe.Pointer)
@@ -92,7 +91,6 @@ type Context interface {
9291
MemsetD8Async(dstDevice DevicePtr, uc byte, N int64, hStream Stream)
9392
ModuleFunction(m Module, name string) (function Function, err error)
9493
ModuleGlobal(m Module, name string) (dptr DevicePtr, size int64, err error)
95-
P2PAttribute(srcDevice Device, attrib P2PAttribute, dstDevice Device) (value int, err error)
9694
Priority(hStream Stream) (priority int, err error)
9795
QueryEvent(hEvent Event)
9896
QueryStream(hStream Stream)

ctx.go

+4-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ package cu
55
// #include <cuda.h>
66
import "C"
77
import (
8-
"log"
98
"runtime"
109
"unsafe"
1110
)
@@ -41,9 +40,6 @@ func newContext(c CUContext) *Ctx {
4140
work: make(chan func() error),
4241
errChan: make(chan error),
4342
}
44-
pc, _, _, _ := runtime.Caller(2)
45-
46-
log.Printf("Created %p by %v", ctx, runtime.FuncForPC(pc).Name())
4743
runtime.SetFinalizer(ctx, finalizeCtx)
4844
return ctx
4945

@@ -61,7 +57,10 @@ func (ctx *Ctx) CUDAContext() CUContext { return ctx.CUContext }
6157
func (ctx *Ctx) Error() error { return ctx.err }
6258

6359
// Work returns the channel where work will be passed in. In most cases you don't need this. Use Run instead.
64-
func (ctx *Ctx) Work() chan func() error { return ctx.work }
60+
func (ctx *Ctx) Work() <-chan func() error { return ctx.work }
61+
62+
// ErrChan returns the internal error channel used
63+
func (ctx *Ctx) ErrChan() chan<- error { return ctx.errChan }
6564

6665
// Run locks the goroutine to the OS thread and ties the CUDA context to the OS thread. For most cases, this would suffice
6766
//
@@ -115,7 +114,6 @@ func (ctx *Ctx) Run(errChan chan error) error {
115114
}
116115

117116
func finalizeCtx(ctx *Ctx) {
118-
log.Printf("Finalizing %p", ctx)
119117
if ctx.CUContext == 0 {
120118
close(ctx.errChan)
121119
close(ctx.work)

ctx_api.go

+1-16
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ func (ctx *Ctx) MemFreeHost(p unsafe.Pointer) {
169169
ctx.err = ctx.Do(f)
170170
}
171171

172-
func (ctx *Ctx) MemAllocManaged(bytesize int64, flags uint) (dptr DevicePtr, err error) {
172+
func (ctx *Ctx) MemAllocManaged(bytesize int64, flags MemAttachFlags) (dptr DevicePtr, err error) {
173173
Cbytesize := C.size_t(bytesize)
174174
Cflags := C.uint(flags)
175175
var Cdptr C.CUdeviceptr
@@ -1022,21 +1022,6 @@ func (ctx *Ctx) CanAccessPeer(dev Device, peerDev Device) (canAccessPeer int, er
10221022
return
10231023
}
10241024

1025-
func (ctx *Ctx) P2PAttribute(srcDevice Device, attrib P2PAttribute, dstDevice Device) (value int, err error) {
1026-
CsrcDevice := C.CUdevice(srcDevice)
1027-
Cattrib := C.CUdevice_P2PAttribute(attrib)
1028-
CdstDevice := C.CUdevice(dstDevice)
1029-
var Cvalue C.int
1030-
f := func() error {
1031-
return result(C.cuDeviceGetP2PAttribute(&Cvalue, Cattrib, CsrcDevice, CdstDevice))
1032-
}
1033-
if err = ctx.Do(f); err != nil {
1034-
err = errors.Wrap(err, "P2PAttribute")
1035-
}
1036-
value = int(Cvalue)
1037-
return
1038-
}
1039-
10401025
func (ctx *Ctx) EnablePeerAccess(peerContext CUContext, Flags uint) {
10411026
CpeerContext := peerContext.c()
10421027
CFlags := C.uint(Flags)

cucontext.go

+39
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package cu
44
import "C"
55
import (
66
"fmt"
7+
"runtime"
78
"unsafe"
89
)
910

@@ -27,6 +28,44 @@ func (d Device) MakeContext(flags ContextFlags) (CUContext, error) {
2728
return makeContext(ctx), nil
2829
}
2930

31+
// Lock ties the calling goroutine to an OS thread, then ties the CUDA context to the thread.
32+
// Do not call in a goroutine.
33+
//
34+
// Good:
35+
/*
36+
func main() {
37+
dev, _ := GetDevice(0)
38+
ctx, _ := dev.MakeContext()
39+
if err := ctx.Lock(); err != nil{
40+
// handle error
41+
}
42+
43+
mem, _ := MemAlloc(1024)
44+
}
45+
*/
46+
// Bad:
47+
/*
48+
func main() {
49+
dev, _ := GetDevice(0)
50+
ctx, _ := dev.MakeContext()
51+
go ctx.Lock() // this will tie the goroutine that calls ctx.Lock to the OS thread, while the main thread does not get the lock
52+
mem, _ := MemAlloc(1024)
53+
}
54+
*/
55+
func (ctx CUContext) Lock() error {
56+
runtime.LockOSThread()
57+
return SetCurrentContext(ctx)
58+
}
59+
60+
// Unlock unbinds the goroutine from the OS thread.
61+
func (ctx CUContext) Unlock() error {
62+
if err := Synchronize(); err != nil {
63+
return err
64+
}
65+
runtime.UnlockOSThread()
66+
return nil
67+
}
68+
3069
// DestroyContext destroys the context. It returns an error if it wasn't properly destroyed
3170
//
3271
// Wrapper over cuCtxDestroy: http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e

0 commit comments

Comments
 (0)