diff --git a/ddprof-lib/src/main/cpp/arch.h b/ddprof-lib/src/main/cpp/arch.h index c1f16072a..e0b898382 100644 --- a/ddprof-lib/src/main/cpp/arch.h +++ b/ddprof-lib/src/main/cpp/arch.h @@ -19,6 +19,12 @@ #include +#ifdef _LP64 +# define LP64_ONLY(code) code +#else // !_LP64 +# define LP64_ONLY(code) +#endif // _LP64 + typedef unsigned char u8; typedef unsigned short u16; typedef unsigned int u32; diff --git a/ddprof-lib/src/main/cpp/callTraceStorage.cpp b/ddprof-lib/src/main/cpp/callTraceStorage.cpp index 7850623d3..e8c0dd128 100644 --- a/ddprof-lib/src/main/cpp/callTraceStorage.cpp +++ b/ddprof-lib/src/main/cpp/callTraceStorage.cpp @@ -19,6 +19,8 @@ #include "os.h" #include +#define COMMA , + static const u32 INITIAL_CAPACITY = 65536; static const u32 CALL_TRACE_CHUNK = 8 * 1024 * 1024; static const u32 OVERFLOW_TRACE_ID = 0x7fffffff; @@ -81,8 +83,7 @@ class LongHashTable { } }; -CallTrace CallTraceStorage::_overflow_trace = { - false, 1, {BCI_ERROR, (jmethodID) "storage_overflow"}}; +CallTrace CallTraceStorage::_overflow_trace = {false, 1, {BCI_ERROR, LP64_ONLY(0 COMMA) (jmethodID)"storage_overflow"}}; CallTraceStorage::CallTraceStorage() : _allocator(CALL_TRACE_CHUNK), _lock(0) { _current_table = LongHashTable::allocate(NULL, INITIAL_CAPACITY); diff --git a/ddprof-lib/src/main/cpp/livenessTracker.cpp b/ddprof-lib/src/main/cpp/livenessTracker.cpp index 9d465ad95..e06c09b2d 100644 --- a/ddprof-lib/src/main/cpp/livenessTracker.cpp +++ b/ddprof-lib/src/main/cpp/livenessTracker.cpp @@ -67,22 +67,14 @@ void LivenessTracker::cleanup_table(bool forced) { if (target != i) { _table[target] = _table[i]; // will clone TrackingEntry at 'i' _table[i].ref = nullptr; // will nullify the original ref - assert(_table[i].frames == _table[target].frames); - _table[i].frames = nullptr; // will nullify the original frames - assert(_table[target].frames != nullptr); + _table[i].call_trace_id = 0; } - assert(_table[target].ref != nullptr && - _table[target].frames != nullptr); _table[target].age += epoch_diff; } else { jweak tmpRef = _table[i].ref; _table[i].ref = nullptr; env->DeleteWeakGlobalRef(tmpRef); - - jvmtiFrameInfo *tmpFrames = _table[i].frames; - _table[i].frames = nullptr; - assert(_table[i].ref == nullptr && _table[i].frames == nullptr); - delete[] tmpFrames; + _table[i].call_trace_id = 0; } } @@ -119,8 +111,6 @@ void LivenessTracker::flush_table(std::set *tracked_thread_ids) { for (int i = 0; i < (sz = _table_size); i++) { jobject ref = env->NewLocalRef(_table[i].ref); if (ref != nullptr) { - assert(_table[i].frames != nullptr); - if (tracked_thread_ids != nullptr) { tracked_thread_ids->insert(_table[i].tid); } @@ -141,9 +131,7 @@ void LivenessTracker::flush_table(std::set *tracked_thread_ids) { : 0; env->ReleaseStringUTFChars(name_str, name); - Profiler::instance()->recordExternalSample( - 1, _table[i].tid, _table[i].frames, _table[i].frames_size, - /*truncated=*/false, BCI_LIVENESS, &event); + Profiler::instance()->recordDeferredSample(_table[i].tid, _table[i].call_trace_id, BCI_LIVENESS, &event); } env->DeleteLocalRef(ref); @@ -292,8 +280,7 @@ Error LivenessTracker::initialize(Arguments &args) { } void LivenessTracker::track(JNIEnv *env, AllocEvent &event, jint tid, - jobject object, int num_frames, - jvmtiFrameInfo *frames) { + jobject object, u32 call_trace_id) { if (!_enabled) { // disabled return; @@ -340,12 +327,7 @@ void LivenessTracker::track(JNIEnv *env, AllocEvent &event, jint tid, _table[idx].alloc = event; _table[idx].skipped = skipped; _table[idx].age = 0; - _table[idx].frames_size = num_frames; - _table[idx].frames = new jvmtiFrameInfo[_table[idx].frames_size]; - if (frames != nullptr) { - memcpy(_table[idx].frames, frames, - sizeof(jvmtiFrameInfo) * _table[idx].frames_size); - } + _table[idx].call_trace_id = call_trace_id; _table[idx].ctx = Contexts::get(tid); } diff --git a/ddprof-lib/src/main/cpp/livenessTracker.h b/ddprof-lib/src/main/cpp/livenessTracker.h index 101d8afae..78fdb46cf 100644 --- a/ddprof-lib/src/main/cpp/livenessTracker.h +++ b/ddprof-lib/src/main/cpp/livenessTracker.h @@ -32,8 +32,7 @@ typedef struct TrackingEntry { jweak ref; AllocEvent alloc; double skipped; - jint frames_size; - jvmtiFrameInfo *frames; + u32 call_trace_id; jint tid; jlong time; jlong age; @@ -100,8 +99,7 @@ class LivenessTracker { Error start(Arguments &args); void stop(); - void track(JNIEnv *env, AllocEvent &event, jint tid, jobject object, - int num_frames, jvmtiFrameInfo *frames); + void track(JNIEnv *env, AllocEvent &event, jint tid, jobject object, u32 call_trace_id); void flush(std::set &tracked_thread_ids); static void JNICALL GarbageCollectionFinish(jvmtiEnv *jvmti_env); diff --git a/ddprof-lib/src/main/cpp/objectSampler.cpp b/ddprof-lib/src/main/cpp/objectSampler.cpp index 1ed5ec5db..854c4fa94 100644 --- a/ddprof-lib/src/main/cpp/objectSampler.cpp +++ b/ddprof-lib/src/main/cpp/objectSampler.cpp @@ -67,9 +67,7 @@ void ObjectSampler::recordAllocation(jvmtiEnv *jvmti, JNIEnv *jni, event._id = id; } - jint frames_size = 0; - jvmtiFrameInfo *frames = nullptr; - + u32 call_trace_id = 0; // we do record the details and stacktraces only for when recording // allocations or liveness if (_record_allocations || _record_liveness) { @@ -78,31 +76,14 @@ void ObjectSampler::recordAllocation(jvmtiEnv *jvmti, JNIEnv *jni, ? 1 : 1 / (1 - exp(-size / (double)_interval))); - frames = new jvmtiFrameInfo[_max_stack_depth]; + call_trace_id = Profiler::instance()->recordJVMTISample(size, tid, thread, BCI_ALLOC, &event, !_record_allocations); - if (jvmti->GetStackTrace(thread, 0, _max_stack_depth, frames, - &frames_size) != JVMTI_ERROR_NONE || - frames_size <= 0) { - delete[] frames; + if (call_trace_id == 0) { return; } - - if (frames_size > 0) { - std::set classes; - jclass method_class; - for (int i = 0; i < frames_size; i++) { - if (jvmti->GetMethodDeclaringClass(frames[i].method, &method_class) == - 0) { - classes.insert(method_class); - } - } - } } if (_record_allocations) { - Profiler::instance()->recordExternalSample( - size, tid, frames, frames_size, /*truncated=*/false, BCI_ALLOC, &event); - u64 current_samples = __sync_add_and_fetch(&_alloc_event_count, 1); // in order to lower the number of atomic reads from the timestamp variable // the check will be performed only each N samples @@ -130,15 +111,10 @@ void ObjectSampler::recordAllocation(jvmtiEnv *jvmti, JNIEnv *jni, } // Either we are recording liveness or tracking GC generations (lightweight - // livenss samples) + // liveness samples) if (_gc_generations || _record_liveness) { - LivenessTracker::instance()->track(jni, event, tid, object, frames_size, - frames); + LivenessTracker::instance()->track(jni, event, tid, object, call_trace_id); } - - // it's safe to delete frames - the liveness tracker keeps a full copy of the - // frames and manages its own memory - delete[] frames; } Error ObjectSampler::check(Arguments &args) { diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index 8eb0ff52a..52f9370f6 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -17,6 +17,7 @@ #include "profiler.h" #include "asyncSampleMutex.h" +#include "common.h" #include "context.h" #include "counters.h" #include "ctimer.h" @@ -548,51 +549,6 @@ int Profiler::getJavaTraceAsync(void *ucontext, ASGCT_CallFrame *frames, return trace.frames - frames + 1; } -int Profiler::getJavaTraceJvmti(jvmtiFrameInfo *jvmti_frames, - ASGCT_CallFrame *frames, int start_depth, - int max_depth) { - int num_frames; - if (VM::jvmti()->GetStackTrace(NULL, start_depth, _max_stack_depth, - jvmti_frames, &num_frames) == 0 && - num_frames > 0) { - return convertFrames(jvmti_frames, frames, num_frames); - } - return 0; -} - -int Profiler::getJavaTraceInternal(jvmtiFrameInfo *jvmti_frames, - ASGCT_CallFrame *frames, int max_depth) { - // We cannot call pure JVM TI here, because it assumes _thread_in_native - // state, but allocation events happen in _thread_in_vm state, see - // https://github.com/jvm-profiling-tools/java-profiler/issues/64 - JNIEnv *jni = VM::jni(); - if (jni == NULL) { - return 0; - } - - JitWriteProtection jit(false); - VMThread *vm_thread = VMThread::fromEnv(jni); - int num_frames; - if (VMStructs::_get_stack_trace(NULL, vm_thread, 0, max_depth, jvmti_frames, - &num_frames) == 0 && - num_frames > 0) { - return convertFrames(jvmti_frames, frames, num_frames); - } - return 0; -} - -inline int Profiler::convertFrames(jvmtiFrameInfo *jvmti_frames, - ASGCT_CallFrame *frames, int num_frames) { - // Convert to AsyncGetCallTrace format. - // Note: jvmti_frames and frames may overlap. - for (int i = 0; i < num_frames; i++) { - jint bci = jvmti_frames[i].location; - frames[i].method_id = jvmti_frames[i].method; - frames[i].bci = bci; - } - return num_frames; -} - void Profiler::fillFrameTypes(ASGCT_CallFrame *frames, int num_frames, NMethod *nmethod) { if (nmethod->isNMethod() && nmethod->isAlive()) { @@ -634,10 +590,7 @@ void Profiler::fillFrameTypes(ASGCT_CallFrame *frames, int num_frames, } } -void Profiler::recordExternalSample(u64 counter, int tid, - jvmtiFrameInfo *jvmti_frames, - jint num_jvmti_frames, bool truncated, - jint event_type, Event *event) { +u32 Profiler::recordJVMTISample(u64 counter, int tid, jthread thread, jint event_type, Event *event, bool deferred) { atomicInc(_total_samples); u32 lock_index = getLockIndex(tid); @@ -647,29 +600,50 @@ void Profiler::recordExternalSample(u64 counter, int tid, // Too many concurrent signals already atomicInc(_failures[-ticks_skipped]); - if (event_type == BCI_CPU && _cpu_engine == &perf_events) { - // Need to reset PerfEvents ring buffer, even though we discard the - // collected trace - PerfEvents::resetBuffer(tid); - } - return; + return 0; } u32 call_trace_id = 0; - if (!_omit_stacktraces && jvmti_frames != nullptr) { + if (!_omit_stacktraces) { ASGCT_CallFrame *frames = _calltrace_buffer[lock_index]->_asgct_frames; + jvmtiFrameInfo *jvmti_frames = _calltrace_buffer[lock_index]->_jvmti_frames; int num_frames = 0; - if (!_jfr.active() && BCI_ALLOC >= event_type && event_type >= BCI_PARK && - event->_id) { - num_frames = makeFrame(frames, event_type, event->_id); + + if (VM::jvmti()->GetStackTrace(thread, 0, _max_stack_depth, jvmti_frames, &num_frames) == JVMTI_ERROR_NONE && num_frames > 0) { + // Convert to AsyncGetCallTrace format. + // Note: jvmti_frames and frames may overlap. + for (int i = 0; i < num_frames; i++) { + jint bci = jvmti_frames[i].location; + jmethodID mid = jvmti_frames[i].method; + frames[i].method_id = mid; + frames[i].bci = bci; + // see https://github.com/async-profiler/async-profiler/pull/1090 + LP64_ONLY(frames[i].padding = 0;) + } } - num_frames += - convertFrames(jvmti_frames, frames + num_frames, num_jvmti_frames); + call_trace_id = _call_trace_storage.put(num_frames, frames, false, counter); + } + if (!deferred) { + _jfr.recordEvent(lock_index, tid, call_trace_id, event_type, event); + } + + _locks[lock_index].unlock(); + return call_trace_id; +} - call_trace_id = - _call_trace_storage.put(num_frames, frames, truncated, counter); +void Profiler::recordDeferredSample(int tid, u32 call_trace_id, jint event_type, Event *event) { + atomicInc(_total_samples); + + u32 lock_index = getLockIndex(tid); + if (!_locks[lock_index].tryLock() && + !_locks[lock_index = (lock_index + 1) % CONCURRENCY_LEVEL].tryLock() && + !_locks[lock_index = (lock_index + 2) % CONCURRENCY_LEVEL].tryLock()) { + // Too many concurrent signals already + atomicInc(_failures[-ticks_skipped]); + return; } + _jfr.recordEvent(lock_index, tid, call_trace_id, event_type, event); _locks[lock_index].unlock(); @@ -1153,13 +1127,11 @@ Error Profiler::start(Arguments &args, bool reset) { // (Re-)allocate calltrace buffers if (_max_stack_depth != args._jstackdepth) { _max_stack_depth = args._jstackdepth; - size_t buffer_size = - (_max_stack_depth + MAX_NATIVE_FRAMES + RESERVED_FRAMES) * - sizeof(CallTraceBuffer); + size_t nelem = _max_stack_depth + MAX_NATIVE_FRAMES + RESERVED_FRAMES; for (int i = 0; i < CONCURRENCY_LEVEL; i++) { free(_calltrace_buffer[i]); - _calltrace_buffer[i] = (CallTraceBuffer *)malloc(buffer_size); + _calltrace_buffer[i] = (CallTraceBuffer*)calloc(nelem, sizeof(CallTraceBuffer)); if (_calltrace_buffer[i] == NULL) { _max_stack_depth = 0; return Error("Not enough memory to allocate stack trace buffers (try " diff --git a/ddprof-lib/src/main/cpp/profiler.h b/ddprof-lib/src/main/cpp/profiler.h index a28917c10..eeaf2ad94 100644 --- a/ddprof-lib/src/main/cpp/profiler.h +++ b/ddprof-lib/src/main/cpp/profiler.h @@ -52,8 +52,9 @@ const int RESERVED_FRAMES = 4; enum EventMask { EM_CPU = 1 << 0, EM_WALL = 1 << 1, EM_ALLOC = 1 << 2 }; -struct CallTraceBuffer { +union CallTraceBuffer { ASGCT_CallFrame _asgct_frames[1]; + jvmtiFrameInfo _jvmti_frames[1]; }; class FrameName; @@ -138,12 +139,6 @@ class Profiler { int tid, StackContext *java_ctx, bool *truncated); int getJavaTraceAsync(void *ucontext, ASGCT_CallFrame *frames, int max_depth, StackContext *java_ctx, bool *truncated); - int getJavaTraceJvmti(jvmtiFrameInfo *jvmti_frames, ASGCT_CallFrame *frames, - int start_depth, int max_depth); - int getJavaTraceInternal(jvmtiFrameInfo *jvmti_frames, - ASGCT_CallFrame *frames, int max_depth); - int convertFrames(jvmtiFrameInfo *jvmti_frames, ASGCT_CallFrame *frames, - int num_frames); void fillFrameTypes(ASGCT_CallFrame *frames, int num_frames, NMethod *nmethod); void updateThreadName(jvmtiEnv *jvmti, JNIEnv *jni, jthread thread, @@ -223,9 +218,8 @@ class Profiler { ASGCT_CallFrame *frames); void recordSample(void *ucontext, u64 weight, int tid, jint event_type, u32 call_trace_id, Event *event); - void recordExternalSample(u64 weight, int tid, jvmtiFrameInfo *jvmti_frames, - jint num_jvmti_frames, bool truncated, - jint event_type, Event *event); + u32 recordJVMTISample(u64 weight, int tid, jthread thread, jint event_type, Event *event, bool deferred); + void recordDeferredSample(int tid, u32 call_trace_id, jint event_type, Event *event); void recordExternalSample(u64 weight, int tid, int num_frames, ASGCT_CallFrame *frames, bool truncated, jint event_type, Event *event); diff --git a/ddprof-lib/src/main/cpp/vmEntry.h b/ddprof-lib/src/main/cpp/vmEntry.h index 14c4803ec..b30c3514f 100644 --- a/ddprof-lib/src/main/cpp/vmEntry.h +++ b/ddprof-lib/src/main/cpp/vmEntry.h @@ -20,6 +20,7 @@ #include +#include "arch.h" #include "codeCache.h" #include "frame.h" @@ -62,9 +63,12 @@ enum ASGCT_Failure { typedef struct { jint bci; + // see https://github.com/async-profiler/async-profiler/pull/1090 + LP64_ONLY(jint padding;) jmethodID method_id; } ASGCT_CallFrame; + typedef struct { JNIEnv *env; jint num_frames; diff --git a/ddprof-lib/src/main/cpp/wallClock.cpp b/ddprof-lib/src/main/cpp/wallClock.cpp index affdaeeee..fa93fe66d 100644 --- a/ddprof-lib/src/main/cpp/wallClock.cpp +++ b/ddprof-lib/src/main/cpp/wallClock.cpp @@ -192,18 +192,7 @@ void WallClockJVMTI::timerLoop() { auto sampleThreads = [&](ThreadEntry& thread_entry, int& num_failures, int& threads_already_exited, int& permission_denied) { static jint max_stack_depth = (jint)Profiler::instance()->max_stack_depth(); - static jvmtiFrameInfo* frame_buffer = new jvmtiFrameInfo[max_stack_depth]; - static jvmtiEnv* jvmti = VM::jvmti(); - int num_frames = 0; - jvmtiError err = jvmti->GetStackTrace(thread_entry.java, 0, max_stack_depth, frame_buffer, &num_frames); - if (err != JVMTI_ERROR_NONE) { - num_failures++; - if (err == JVMTI_ERROR_THREAD_NOT_ALIVE) { - threads_already_exited++; - } - return false; - } ExecutionEvent event; VMThread* vm_thread = thread_entry.native; int raw_thread_state = vm_thread->state(); @@ -225,7 +214,7 @@ void WallClockJVMTI::timerLoop() { event._execution_mode = mode; event._weight = 1; - Profiler::instance()->recordExternalSample(1, thread_entry.native->osThreadId(), frame_buffer, num_frames, false, BCI_WALL, &event); + Profiler::instance()->recordJVMTISample(1, thread_entry.native->osThreadId(), thread_entry.java, BCI_WALL, &event, false); return true; };