Skip to content

Commit

Permalink
only send gauges periodically
Browse files Browse the repository at this point in the history
  • Loading branch information
dwwoelfel committed Jan 14, 2025
1 parent 1f61651 commit 0617d36
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 98 deletions.
37 changes: 23 additions & 14 deletions server/src/instant/gauges.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
"Repeatedly saves metric information in `tracer/last-calculated-metrics`."
(:require
[clojure.string :as str]
[instant.util.tracer :as tracer]
[instant.util.delay :as delay])
[instant.config :as config]
[instant.util.delay :as delay]
[instant.util.tracer :as tracer])
(:import
(java.lang.management ManagementFactory GarbageCollectorMXBean)
(javax.management NotificationEmitter NotificationListener)
(com.sun.management GarbageCollectionNotificationInfo)
(java.lang.management GarbageCollectorMXBean ManagementFactory ThreadInfo)
(java.util.concurrent ScheduledFuture ThreadPoolExecutor)
(javax.management NotificationEmitter NotificationListener)
(javax.management.openmbean CompositeData)))

(defonce gauge-metric-fns (atom {}))
Expand Down Expand Up @@ -71,13 +73,15 @@
:value (-> thread .getThreadCount)}
{:path "jvm.thread.daemon.count"
:value (-> thread .getDaemonThreadCount)}
(for [thread-state (Thread$State/values)]
{:path (str "jvm.thread." (-> thread-state str str/lower-case) ".count")
:value (count
(filter #(and % (= thread-state (.getThreadState %)))
(.getThreadInfo thread
(-> thread .getAllThreadIds))))})
(for [[executor description]
(let [thread-groups (group-by (fn [^ThreadInfo thread-info]
(when thread-info
(.getThreadState thread-info)))
(.getThreadInfo thread
(.getAllThreadIds thread)))]
(for [thread-state (Thread$State/values)]
{:path (str "jvm.thread." (-> thread-state str str/lower-case) ".count")
:value (count (get thread-groups thread-state))}))
(for [[^ThreadPoolExecutor executor description]
[[solo-executor "agent-pool.send-off"]
[pooled-executor "agent-pool.send"]]]
[{:path (str "jvm." description ".queue-depth")
Expand Down Expand Up @@ -109,7 +113,12 @@

(defn straight-jacket-record-gauges []
(try
(reset! tracer/last-calculated-metrics (gauges))
(if (= :prod (config/get-env))
(tracer/record-info! {:name "gauges"
:attributes (gauges)})
;; Run them in dev so any issues will bubble up, but don't bother
;; sending anything to honeycomb
(gauges))
(catch Throwable t
(tracer/record-exception-span! t {:name "gauges/straight-jacket-record-gauges"}))))

Expand Down Expand Up @@ -147,13 +156,13 @@
(defn start []
(tracer/record-info! {:name "gauges/start"})
(def record-job (delay/repeat-fn delay-pool
1000
30000 ;; 30 seconds
#'straight-jacket-record-gauges))
(def cleanup-gc-listeners (add-gc-listeners)))

(defn stop []
(when (bound? #'record-job)
(.cancel record-job true))
(.cancel ^ScheduledFuture record-job true))
(when (bound? #'cleanup-gc-listeners)
(cleanup-gc-listeners)))

Expand Down
140 changes: 71 additions & 69 deletions server/src/instant/util/logging_exporter.clj
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
(ns instant.util.logging-exporter
"Exporter that logs spans to stdout. In development we also colorize the logs"
(:require
[clojure.tools.logging :as log]
[clojure.string :as string]
[clojure.tools.logging :as log]
[instant.config :as config])
(:import (io.opentelemetry.api.common AttributeKey)
(io.opentelemetry.api.trace Span)
(io.opentelemetry.sdk.common CompletableResultCode)
(io.opentelemetry.sdk.trace.export SpanExporter)
(java.util.concurrent TimeUnit)
(java.util.concurrent.atomic AtomicBoolean)))
(:import
(io.opentelemetry.api.common AttributeKey)
(io.opentelemetry.sdk.common CompletableResultCode)
(io.opentelemetry.sdk.trace.data SpanData)
(io.opentelemetry.sdk.trace.export SpanExporter)
(java.util.concurrent TimeUnit)
(java.util.concurrent.atomic AtomicBoolean)))

;; -------
;; Colors
;; ------
;; Colors

(def colors
"All ansi color codes that look good against black"
Expand All @@ -29,7 +30,7 @@
(defn- error-color [s]

Check warning on line 30 in server/src/instant/util/logging_exporter.clj

View workflow job for this annotation

GitHub Actions / lint

Unused private var instant.util.logging-exporter/error-color
(format "\033[1;37;41m%s\033[0m" s))

(defn- uniq-color [s]
(defn- uniq-color [^String s]
(let [n (.hashCode s)
i (mod n (count colors))]
(format "\033[1;38;5;%dm%s\033[0m" (colors i) s)))
Expand All @@ -39,36 +40,31 @@
s
(color-f s)))

(defn duration-ms [span]
(defn duration-ms [^SpanData span]
(let [start (.getStartEpochNanos span)
end (.getEndEpochNanos span)]
(.toMillis (TimeUnit/NANOSECONDS)
(- end start))))

(def exclude-ks #{"SampleRate"
"thread.name"
"thread.id"
"code.lineno"
"code.namespace"
"code.filepath"
"host.name"
"detailed_query"
"detailed_patterns"
"detailed_tx_steps"
"process_id"
"query"})

(defn exclude? [[k]]
(or (exclude-ks k)
;; `detailed_` columns in our logs are just
;; too noisy. It's still nice to have in honeycomb,
;; but it distracts in stdout.
(string/starts-with? k "detailed_")
;; `jvm.` columns are used to associate metrics to
;; every span. This is too noisy for stdout
(string/starts-with? k "jvm.")
;; gauge metrics for a namespace
(string/starts-with? k "instant.")))
(defn exclude? [k]
(case k
("SampleRate"
"thread.name"
"thread.id"
"code.lineno"
"code.namespace"
"code.filepath"
"host.name"
"detailed_query"
"detailed_patterns"
"detailed_tx_steps"
"process_id"
"query") true
(or

Check warning on line 63 in server/src/instant/util/logging_exporter.clj

View workflow job for this annotation

GitHub Actions / lint

Single arg use of or always returns the arg itself
;; `detailed_` columns in our logs are just
;; too noisy. It's still nice to have in honeycomb,
;; but it distracts in stdout.
(string/starts-with? k "detailed_"))))

(defn format-attr-value
"Formats attr values for logs."
Expand All @@ -79,20 +75,18 @@
v))

(defn attr-str [attrs]
(->> attrs
(map (fn [[k v]] [(str k) v]))
(remove exclude?)
(map (fn [[k v]]
(format "%s=%s"
(if (= k "exception.message")
(colorize error-color k)
k)
(format-attr-value v))))
(interpose " ")
string/join))

(defn event-str [span-event]
(attr-str (.asMap (.getAttributes span-event))))
(let [sb (StringBuilder.)]
(doseq [[k v] attrs
:let [k (str k)]
:when (not (exclude? k))]
(.append sb k)
(.append sb "=")
(.append sb (format-attr-value v))
(.append sb " "))
(.toString sb)))

(defn event-str [^SpanData span]
(attr-str (.getAttributes span)))

(defn friendly-trace [trace-id]
(if (seq trace-id)
Expand All @@ -101,30 +95,35 @@
(subs trace-id 0 4))
"unk"))

(defn escape-newlines [s]
(string/replace s #"\n" "\\\\n"))

(defn span-str [span]
(let [attr-str (attr-str (.getAttributes span))
event-strs (map event-str (.getEvents span))
data-str (string/join
" "
(into [attr-str] event-strs))]
(format "[%s] %sms [%s] %s"
(colorize uniq-color (friendly-trace (.getTraceId span)))
(duration-ms span)
(colorize uniq-color (.getName span))
(cond-> data-str
(= :prod (config/get-env)) escape-newlines))))
(defn escape-newlines [^String s]
(.replace s "\n" "\\\\n"))

(def span-str
(if (= :prod (config/get-env))
(fn [^SpanData span]
(let [attr-str (attr-str (.getAttributes span))]
(format "[%s] %sms [%s] %s"
(.getTraceId span)
(duration-ms span)
(.getName span)
(escape-newlines attr-str))))
(fn [^SpanData span]
(let [attr-str (attr-str (.getAttributes span))]
(format "[%s] %sms [%s] %s"
(colorize uniq-color (friendly-trace (.getTraceId span)))
(duration-ms span)
(colorize uniq-color (.getName span))
attr-str)))))

(def op-attr-key (AttributeKey/stringKey "op"))

(def exclude-span?
(if (= :prod (config/get-env))
(fn [^Span span]
(fn [^SpanData span]
(let [n (.getName span)]
(case n
("gc"
"gauges"
"ws/send-json!"
"handle-refresh/send-event!"
"store/record-datalog-query-finish!"
Expand All @@ -148,17 +147,20 @@
false)

(string/starts-with? n "e2e"))))
(fn [span]
(fn [^SpanData span]
(let [n (.getName span)]
(or (= n "gc")
(string/starts-with? n "e2e"))))))
(case n
("gc"
"gauges") true

(string/starts-with? n "e2e"))))))

(defn log-spans [spans]
(doseq [span spans
:when (not (exclude-span? span))]
(log/info (span-str span))))

(defn export [shutdown? spans]
(defn export [^AtomicBoolean shutdown? spans]
(if (.get shutdown?)
(CompletableResultCode/ofFailure)
(do (log-spans spans)
Expand Down
28 changes: 13 additions & 15 deletions server/src/instant/util/tracer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
(io.opentelemetry.sdk OpenTelemetrySdk)
(io.opentelemetry.sdk.resources Resource)
(io.opentelemetry.sdk.trace SdkTracer SdkTracerProvider)
(io.opentelemetry.sdk.trace.export BatchSpanProcessor SimpleSpanProcessor)))
(io.opentelemetry.sdk.trace.export BatchSpanProcessor SimpleSpanProcessor)
(java.util.concurrent TimeUnit)))

Check warning on line 18 in server/src/instant/util/tracer.clj

View workflow job for this annotation

GitHub Actions / lint

Unused import TimeUnit

(def ^:dynamic *span* nil)

Expand Down Expand Up @@ -46,7 +47,11 @@
[honeycomb-api-key]
(let [trace-provider-builder (SdkTracerProvider/builder)
sdk-builder (OpenTelemetrySdk/builder)
log-processor (SimpleSpanProcessor/create (logging-exporter/create))
log-processor (if (= :prod (config/get-env))
(let [builder (BatchSpanProcessor/builder (logging-exporter/create))]
(.setScheduleDelay builder 500 java.util.concurrent.TimeUnit/MILLISECONDS)
(.build builder))
(SimpleSpanProcessor/create (logging-exporter/create)))
otlp-builder (OtlpGrpcSpanExporter/builder)
resource (.merge (Resource/getDefault)
(Resource/create (Attributes/of (AttributeKey/stringKey "service.name")
Expand All @@ -73,21 +78,14 @@
(make-log-only-sdk))]
(reset! tracer (.getTracer sdk "instant-server"))))

;; Stores metrics calculated by instant.gauges
;; These metrics are attached to every span.
;; Note: adding columns to spans are free in Honeycomb.
;; Having metrics on each span is a good way to observe changes.
(defonce last-calculated-metrics (atom {}))

(defn new-span!
[{span-name :name :keys [attributes source] :as params}]
(when-not span-name
(throw (Exception. (format "Expected a map with :name key, got %s." params))))
(let [thread (Thread/currentThread)
{:keys [code-ns code-line code-file]} source
default-attributes (cond-> @last-calculated-metrics
true (assoc "host.name" @config/hostname
"process-id" @config/process-id)
default-attributes (cond-> {"host.name" @config/hostname
"process-id" @config/process-id}
thread (assoc "thread.name"
(.getName thread)
"thread.id"
Expand All @@ -99,10 +97,10 @@
(-> (get-tracer)
(.spanBuilder (name span-name))
(cond->
*span* (.setParent (-> (Context/current)
(.with *span*)))
:always (.setAllAttributes (attr/->attributes attributes'))
(not *span*) .setNoParent)
*span* (.setParent (-> (Context/current)
(.with *span*)))
:always (.setAllAttributes (attr/->attributes attributes'))
(not *span*) .setNoParent)
.startSpan)))

(def ^:private keyword->StatusCode
Expand Down

0 comments on commit 0617d36

Please sign in to comment.