diff --git a/server/src/instant/gauges.clj b/server/src/instant/gauges.clj index 68285ad32..6eed6626d 100644 --- a/server/src/instant/gauges.clj +++ b/server/src/instant/gauges.clj @@ -2,12 +2,14 @@ "Repeatedly saves metric information in `tracer/last-calculated-metrics`." (:require [clojure.string :as str] - [instant.util.tracer :as tracer] - [instant.util.delay :as delay]) + [instant.config :as config] + [instant.util.delay :as delay] + [instant.util.tracer :as tracer]) (:import - (java.lang.management ManagementFactory GarbageCollectorMXBean) - (javax.management NotificationEmitter NotificationListener) (com.sun.management GarbageCollectionNotificationInfo) + (java.lang.management GarbageCollectorMXBean ManagementFactory ThreadInfo) + (java.util.concurrent ScheduledFuture ThreadPoolExecutor) + (javax.management NotificationEmitter NotificationListener) (javax.management.openmbean CompositeData))) (defonce gauge-metric-fns (atom {})) @@ -71,13 +73,15 @@ :value (-> thread .getThreadCount)} {:path "jvm.thread.daemon.count" :value (-> thread .getDaemonThreadCount)} - (for [thread-state (Thread$State/values)] - {:path (str "jvm.thread." (-> thread-state str str/lower-case) ".count") - :value (count - (filter #(and % (= thread-state (.getThreadState %))) - (.getThreadInfo thread - (-> thread .getAllThreadIds))))}) - (for [[executor description] + (let [thread-groups (group-by (fn [^ThreadInfo thread-info] + (when thread-info + (.getThreadState thread-info))) + (.getThreadInfo thread + (.getAllThreadIds thread)))] + (for [thread-state (Thread$State/values)] + {:path (str "jvm.thread." (-> thread-state str str/lower-case) ".count") + :value (count (get thread-groups thread-state))})) + (for [[^ThreadPoolExecutor executor description] [[solo-executor "agent-pool.send-off"] [pooled-executor "agent-pool.send"]]] [{:path (str "jvm." description ".queue-depth") @@ -109,7 +113,12 @@ (defn straight-jacket-record-gauges [] (try - (reset! tracer/last-calculated-metrics (gauges)) + (if (= :prod (config/get-env)) + (tracer/record-info! {:name "gauges" + :attributes (gauges)}) + ;; Run them in dev so any issues will bubble up, but don't bother + ;; sending anything to honeycomb + (gauges)) (catch Throwable t (tracer/record-exception-span! t {:name "gauges/straight-jacket-record-gauges"})))) @@ -147,13 +156,13 @@ (defn start [] (tracer/record-info! {:name "gauges/start"}) (def record-job (delay/repeat-fn delay-pool - 1000 + 30000 ;; 30 seconds #'straight-jacket-record-gauges)) (def cleanup-gc-listeners (add-gc-listeners))) (defn stop [] (when (bound? #'record-job) - (.cancel record-job true)) + (.cancel ^ScheduledFuture record-job true)) (when (bound? #'cleanup-gc-listeners) (cleanup-gc-listeners))) diff --git a/server/src/instant/util/logging_exporter.clj b/server/src/instant/util/logging_exporter.clj index 382b658c9..7d993b47b 100644 --- a/server/src/instant/util/logging_exporter.clj +++ b/server/src/instant/util/logging_exporter.clj @@ -1,18 +1,19 @@ (ns instant.util.logging-exporter "Exporter that logs spans to stdout. In development we also colorize the logs" (:require - [clojure.tools.logging :as log] [clojure.string :as string] + [clojure.tools.logging :as log] [instant.config :as config]) - (:import (io.opentelemetry.api.common AttributeKey) - (io.opentelemetry.api.trace Span) - (io.opentelemetry.sdk.common CompletableResultCode) - (io.opentelemetry.sdk.trace.export SpanExporter) - (java.util.concurrent TimeUnit) - (java.util.concurrent.atomic AtomicBoolean))) + (:import + (io.opentelemetry.api.common AttributeKey) + (io.opentelemetry.sdk.common CompletableResultCode) + (io.opentelemetry.sdk.trace.data SpanData) + (io.opentelemetry.sdk.trace.export SpanExporter) + (java.util.concurrent TimeUnit) + (java.util.concurrent.atomic AtomicBoolean))) -;; ------- -;; Colors +;; ------ +;; Colors (def colors "All ansi color codes that look good against black" @@ -29,7 +30,7 @@ (defn- error-color [s] (format "\033[1;37;41m%s\033[0m" s)) -(defn- uniq-color [s] +(defn- uniq-color [^String s] (let [n (.hashCode s) i (mod n (count colors))] (format "\033[1;38;5;%dm%s\033[0m" (colors i) s))) @@ -39,36 +40,31 @@ s (color-f s))) -(defn duration-ms [span] +(defn duration-ms [^SpanData span] (let [start (.getStartEpochNanos span) end (.getEndEpochNanos span)] (.toMillis (TimeUnit/NANOSECONDS) (- end start)))) -(def exclude-ks #{"SampleRate" - "thread.name" - "thread.id" - "code.lineno" - "code.namespace" - "code.filepath" - "host.name" - "detailed_query" - "detailed_patterns" - "detailed_tx_steps" - "process_id" - "query"}) - -(defn exclude? [[k]] - (or (exclude-ks k) - ;; `detailed_` columns in our logs are just - ;; too noisy. It's still nice to have in honeycomb, - ;; but it distracts in stdout. - (string/starts-with? k "detailed_") - ;; `jvm.` columns are used to associate metrics to - ;; every span. This is too noisy for stdout - (string/starts-with? k "jvm.") - ;; gauge metrics for a namespace - (string/starts-with? k "instant."))) +(defn exclude? [k] + (case k + ("SampleRate" + "thread.name" + "thread.id" + "code.lineno" + "code.namespace" + "code.filepath" + "host.name" + "detailed_query" + "detailed_patterns" + "detailed_tx_steps" + "process_id" + "query") true + (or + ;; `detailed_` columns in our logs are just + ;; too noisy. It's still nice to have in honeycomb, + ;; but it distracts in stdout. + (string/starts-with? k "detailed_")))) (defn format-attr-value "Formats attr values for logs." @@ -79,20 +75,18 @@ v)) (defn attr-str [attrs] - (->> attrs - (map (fn [[k v]] [(str k) v])) - (remove exclude?) - (map (fn [[k v]] - (format "%s=%s" - (if (= k "exception.message") - (colorize error-color k) - k) - (format-attr-value v)))) - (interpose " ") - string/join)) - -(defn event-str [span-event] - (attr-str (.asMap (.getAttributes span-event)))) + (let [sb (StringBuilder.)] + (doseq [[k v] attrs + :let [k (str k)] + :when (not (exclude? k))] + (.append sb k) + (.append sb "=") + (.append sb (format-attr-value v)) + (.append sb " ")) + (.toString sb))) + +(defn event-str [^SpanData span] + (attr-str (.getAttributes span))) (defn friendly-trace [trace-id] (if (seq trace-id) @@ -101,30 +95,35 @@ (subs trace-id 0 4)) "unk")) -(defn escape-newlines [s] - (string/replace s #"\n" "\\\\n")) - -(defn span-str [span] - (let [attr-str (attr-str (.getAttributes span)) - event-strs (map event-str (.getEvents span)) - data-str (string/join - " " - (into [attr-str] event-strs))] - (format "[%s] %sms [%s] %s" - (colorize uniq-color (friendly-trace (.getTraceId span))) - (duration-ms span) - (colorize uniq-color (.getName span)) - (cond-> data-str - (= :prod (config/get-env)) escape-newlines)))) +(defn escape-newlines [^String s] + (.replace s "\n" "\\\\n")) + +(def span-str + (if (= :prod (config/get-env)) + (fn [^SpanData span] + (let [attr-str (attr-str (.getAttributes span))] + (format "[%s] %sms [%s] %s" + (.getTraceId span) + (duration-ms span) + (.getName span) + (escape-newlines attr-str)))) + (fn [^SpanData span] + (let [attr-str (attr-str (.getAttributes span))] + (format "[%s] %sms [%s] %s" + (colorize uniq-color (friendly-trace (.getTraceId span))) + (duration-ms span) + (colorize uniq-color (.getName span)) + attr-str))))) (def op-attr-key (AttributeKey/stringKey "op")) (def exclude-span? (if (= :prod (config/get-env)) - (fn [^Span span] + (fn [^SpanData span] (let [n (.getName span)] (case n ("gc" + "gauges" "ws/send-json!" "handle-refresh/send-event!" "store/record-datalog-query-finish!" @@ -148,17 +147,20 @@ false) (string/starts-with? n "e2e")))) - (fn [span] + (fn [^SpanData span] (let [n (.getName span)] - (or (= n "gc") - (string/starts-with? n "e2e")))))) + (case n + ("gc" + "gauges") true + + (string/starts-with? n "e2e")))))) (defn log-spans [spans] (doseq [span spans :when (not (exclude-span? span))] (log/info (span-str span)))) -(defn export [shutdown? spans] +(defn export [^AtomicBoolean shutdown? spans] (if (.get shutdown?) (CompletableResultCode/ofFailure) (do (log-spans spans) diff --git a/server/src/instant/util/tracer.clj b/server/src/instant/util/tracer.clj index 42d4f868e..47412e7a8 100644 --- a/server/src/instant/util/tracer.clj +++ b/server/src/instant/util/tracer.clj @@ -14,7 +14,8 @@ (io.opentelemetry.sdk OpenTelemetrySdk) (io.opentelemetry.sdk.resources Resource) (io.opentelemetry.sdk.trace SdkTracer SdkTracerProvider) - (io.opentelemetry.sdk.trace.export BatchSpanProcessor SimpleSpanProcessor))) + (io.opentelemetry.sdk.trace.export BatchSpanProcessor SimpleSpanProcessor) + (java.util.concurrent TimeUnit))) (def ^:dynamic *span* nil) @@ -46,7 +47,11 @@ [honeycomb-api-key] (let [trace-provider-builder (SdkTracerProvider/builder) sdk-builder (OpenTelemetrySdk/builder) - log-processor (SimpleSpanProcessor/create (logging-exporter/create)) + log-processor (if (= :prod (config/get-env)) + (let [builder (BatchSpanProcessor/builder (logging-exporter/create))] + (.setScheduleDelay builder 500 java.util.concurrent.TimeUnit/MILLISECONDS) + (.build builder)) + (SimpleSpanProcessor/create (logging-exporter/create))) otlp-builder (OtlpGrpcSpanExporter/builder) resource (.merge (Resource/getDefault) (Resource/create (Attributes/of (AttributeKey/stringKey "service.name") @@ -73,21 +78,14 @@ (make-log-only-sdk))] (reset! tracer (.getTracer sdk "instant-server")))) -;; Stores metrics calculated by instant.gauges -;; These metrics are attached to every span. -;; Note: adding columns to spans are free in Honeycomb. -;; Having metrics on each span is a good way to observe changes. -(defonce last-calculated-metrics (atom {})) - (defn new-span! [{span-name :name :keys [attributes source] :as params}] (when-not span-name (throw (Exception. (format "Expected a map with :name key, got %s." params)))) (let [thread (Thread/currentThread) {:keys [code-ns code-line code-file]} source - default-attributes (cond-> @last-calculated-metrics - true (assoc "host.name" @config/hostname - "process-id" @config/process-id) + default-attributes (cond-> {"host.name" @config/hostname + "process-id" @config/process-id} thread (assoc "thread.name" (.getName thread) "thread.id" @@ -99,10 +97,10 @@ (-> (get-tracer) (.spanBuilder (name span-name)) (cond-> - *span* (.setParent (-> (Context/current) - (.with *span*))) - :always (.setAllAttributes (attr/->attributes attributes')) - (not *span*) .setNoParent) + *span* (.setParent (-> (Context/current) + (.with *span*))) + :always (.setAllAttributes (attr/->attributes attributes')) + (not *span*) .setNoParent) .startSpan))) (def ^:private keyword->StatusCode