Skip to content

Commit e9554ad

Browse files
iliana and smklein authored
add disk metrics endpoint (#1348)
* add disk metrics endpoint * Fix tags, openapi * Use volume ID as upstairs ID * Make clippy happy * Add integration test for metrics collection * Unprivileged_access added to VerifyEndpoint * Add limits, fix pagination (hopefully. Tests incoming) * Add test for limit * Fix pagination, add tests * NotFound -> Empty Vec in Nexus * Fix merge Co-authored-by: Sean Klein <[email protected]>
1 parent 948e537 commit e9554ad

File tree

22 files changed

+1356
-35
lines changed

22 files changed

+1356
-35
lines changed

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

nexus/src/app/oximeter.rs

Lines changed: 117 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ use crate::authz;
88
use crate::context::OpContext;
99
use crate::db;
1010
use crate::db::identity::Asset;
11+
use crate::external_api::params::ResourceMetrics;
1112
use crate::internal_api::params::OximeterInfo;
13+
use dropshot::PaginationParams;
1214
use internal_dns_client::{
1315
multiclient::{ResolveError, Resolver},
1416
names::{ServiceName, SRV},
@@ -21,6 +23,8 @@ use omicron_common::api::external::PaginationOrder;
2123
use omicron_common::api::internal::nexus;
2224
use omicron_common::backoff;
2325
use oximeter_client::Client as OximeterClient;
26+
use oximeter_db::query::Timestamp;
27+
use oximeter_db::Measurement;
2428
use oximeter_db::TimeseriesSchema;
2529
use oximeter_db::TimeseriesSchemaPaginationParams;
2630
use oximeter_producer::register;
@@ -212,14 +216,112 @@ impl super::Nexus {
212216
.map_err(|e| Error::internal_error(&e.to_string()))?
213217
.timeseries_schema_list(&pag_params.page, limit)
214218
.await
215-
.map_err(|e| match e {
216-
oximeter_db::Error::DatabaseUnavailable(_) => {
217-
Error::ServiceUnavailable {
218-
internal_message: e.to_string(),
219-
}
219+
.map_err(map_oximeter_err)
220+
}
221+
222+
/// Returns results from the timeseries DB based on the provided query
223+
/// parameters.
224+
///
225+
/// * `timeseries_name`: The "target:metric" name identifying the metric to
226+
/// be queried.
227+
/// * `criteria`: Any additional parameters to help narrow down the query
228+
/// selection further. These parameters are passed directly to
229+
/// [oximeter_db::Client::select_timeseries_with].
230+
/// * `query_params`: Pagination parameter, identifying which page of
231+
/// results to return.
232+
/// * `limit`: The maximum number of results to return in a paginated
233+
/// request.
234+
pub async fn select_timeseries(
235+
&self,
236+
timeseries_name: &str,
237+
criteria: &[&str],
238+
query_params: PaginationParams<ResourceMetrics, ResourceMetrics>,
239+
limit: NonZeroU32,
240+
) -> Result<dropshot::ResultsPage<Measurement>, Error> {
241+
#[inline]
242+
fn no_results() -> dropshot::ResultsPage<Measurement> {
243+
dropshot::ResultsPage { next_page: None, items: Vec::new() }
244+
}
245+
246+
let (start_time, end_time, query) = match query_params.page {
247+
// Generally, we want the time bounds to be inclusive for the
248+
// start time, and exclusive for the end time...
249+
dropshot::WhichPage::First(query) => (
250+
Timestamp::Inclusive(query.start_time),
251+
Timestamp::Exclusive(query.end_time),
252+
query,
253+
),
254+
// ... but for subsequent pages, we use the "last observed"
255+
// timestamp as the start time. If we used an inclusive bound,
256+
// we'd duplicate the returned measurement. To return each
257+
// measurement exactly once, we make the start time "exclusive"
258+
// on all "next" pages.
259+
dropshot::WhichPage::Next(query) => (
260+
Timestamp::Exclusive(query.start_time),
261+
Timestamp::Exclusive(query.end_time),
262+
query,
263+
),
264+
};
265+
if query.start_time >= query.end_time {
266+
return Ok(no_results());
267+
}
268+
269+
let timeseries_list = self
270+
.timeseries_client
271+
.get()
272+
.await
273+
.map_err(|e| {
274+
Error::internal_error(&format!(
275+
"Cannot access timeseries DB: {}",
276+
e
277+
))
278+
})?
279+
.select_timeseries_with(
280+
timeseries_name,
281+
criteria,
282+
Some(start_time),
283+
Some(end_time),
284+
Some(limit),
285+
)
286+
.await
287+
.or_else(|err| {
288+
// If the timeseries name exists in the API, but not in ClickHouse,
289+
// it might just not have been populated yet.
290+
match err {
291+
oximeter_db::Error::TimeseriesNotFound(_) => Ok(vec![]),
292+
_ => Err(err),
220293
}
221-
_ => Error::InternalError { internal_message: e.to_string() },
222294
})
295+
.map_err(map_oximeter_err)?;
296+
297+
if timeseries_list.len() > 1 {
298+
return Err(Error::internal_error(&format!(
299+
"expected 1 timeseries but got {} ({:?} {:?})",
300+
timeseries_list.len(),
301+
timeseries_name,
302+
criteria
303+
)));
304+
}
305+
306+
// If we received no data, exit early.
307+
let timeseries =
308+
if let Some(timeseries) = timeseries_list.into_iter().next() {
309+
timeseries
310+
} else {
311+
return Ok(no_results());
312+
};
313+
314+
Ok(dropshot::ResultsPage::new(
315+
timeseries.measurements,
316+
&query,
317+
|last_measurement: &Measurement, query: &ResourceMetrics| {
318+
ResourceMetrics {
319+
start_time: last_measurement.timestamp(),
320+
end_time: query.end_time,
321+
}
322+
},
323+
)
324+
.unwrap())
223325
}
224326

225327
// Internal helper to build an Oximeter client from its ID and address (common data between
@@ -259,3 +361,12 @@ impl super::Nexus {
259361
Ok((self.build_oximeter_client(&id, address), id))
260362
}
261363
}
364+
365+
fn map_oximeter_err(error: oximeter_db::Error) -> Error {
366+
match error {
367+
oximeter_db::Error::DatabaseUnavailable(_) => {
368+
Error::ServiceUnavailable { internal_message: error.to_string() }
369+
}
370+
_ => Error::InternalError { internal_message: error.to_string() },
371+
}
372+
}

nexus/src/external_api/http_entrypoints.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ use omicron_common::api::external::Saga;
6767
use omicron_common::api::external::VpcFirewallRuleUpdateParams;
6868
use omicron_common::api::external::VpcFirewallRules;
6969
use omicron_common::bail_unless;
70+
use parse_display::Display;
7071
use ref_cast::RefCast;
7172
use schemars::JsonSchema;
7273
use serde::Deserialize;
@@ -115,6 +116,7 @@ pub fn external_api() -> NexusApiDescription {
115116
api.register(disk_view)?;
116117
api.register(disk_view_by_id)?;
117118
api.register(disk_delete)?;
119+
api.register(disk_metrics_list)?;
118120

119121
api.register(instance_list)?;
120122
api.register(instance_create)?;
@@ -1515,6 +1517,65 @@ async fn disk_delete(
15151517
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
15161518
}
15171519

1520+
#[derive(Display, Deserialize, JsonSchema)]
1521+
#[display(style = "snake_case")]
1522+
#[serde(rename_all = "snake_case")]
1523+
pub enum DiskMetricName {
1524+
Activated,
1525+
Flush,
1526+
Read,
1527+
ReadBytes,
1528+
Write,
1529+
WriteBytes,
1530+
}
1531+
1532+
/// Fetch metrics for a disk.
1533+
#[endpoint {
1534+
method = GET,
1535+
path = "/organizations/{organization_name}/projects/{project_name}/disks/{disk_name}/metrics/{metric_name}",
1536+
tags = ["disks"],
1537+
}]
1538+
async fn disk_metrics_list(
1539+
rqctx: Arc<RequestContext<Arc<ServerContext>>>,
1540+
path_params: Path<MetricsPathParam<DiskPathParam, DiskMetricName>>,
1541+
query_params: Query<
1542+
PaginationParams<params::ResourceMetrics, params::ResourceMetrics>,
1543+
>,
1544+
) -> Result<HttpResponseOk<ResultsPage<oximeter_db::Measurement>>, HttpError> {
1545+
let apictx = rqctx.context();
1546+
let nexus = &apictx.nexus;
1547+
1548+
let path = path_params.into_inner();
1549+
let organization_name = &path.inner.organization_name;
1550+
let project_name = &path.inner.project_name;
1551+
let disk_name = &path.inner.disk_name;
1552+
let metric_name = path.metric_name;
1553+
1554+
let query = query_params.into_inner();
1555+
let limit = rqctx.page_limit(&query)?;
1556+
1557+
let handler = async {
1558+
let opctx = OpContext::for_external_api(&rqctx).await?;
1559+
1560+
// This ensures the user is authorized on Action::Read for this disk
1561+
let disk = nexus
1562+
.disk_fetch(&opctx, organization_name, project_name, disk_name)
1563+
.await?;
1564+
let upstairs_uuid = disk.id();
1565+
let result = nexus
1566+
.select_timeseries(
1567+
&format!("crucible_upstairs:{}", metric_name),
1568+
&[&format!("upstairs_uuid=={}", upstairs_uuid)],
1569+
query,
1570+
limit,
1571+
)
1572+
.await?;
1573+
1574+
Ok(HttpResponseOk(result))
1575+
};
1576+
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
1577+
}
1578+
15181579
// Instances
15191580

15201581
/// List instances in a project.
@@ -4093,6 +4154,15 @@ async fn session_sshkey_delete(
40934154
apictx.external_latencies.instrument_dropshot_handler(&rqctx, handler).await
40944155
}
40954156

4157+
/// Path parameters for metrics requests where `/metrics/{metric_name}` is
4158+
/// appended to an existing path parameter type
4159+
#[derive(Deserialize, JsonSchema)]
4160+
struct MetricsPathParam<T, M> {
4161+
#[serde(flatten)]
4162+
inner: T,
4163+
metric_name: M,
4164+
}
4165+
40964166
#[cfg(test)]
40974167
mod test {
40984168
use super::external_api;

nexus/test-utils/src/lib.rs

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ pub async fn test_setup_with_config(
159159
)
160160
.await
161161
.unwrap();
162+
register_test_producer(&producer).unwrap();
162163

163164
ControlPlaneTestContext {
164165
server,
@@ -253,6 +254,10 @@ impl oximeter::Producer for IntegrationProducer {
253254
}
254255
}
255256

257+
/// Creates and starts a producer server.
258+
///
259+
/// Actual producers can be registered with the [`register_producer`]
260+
/// helper function.
256261
pub async fn start_producer_server(
257262
nexus_address: SocketAddr,
258263
id: Uuid,
@@ -281,9 +286,22 @@ pub async fn start_producer_server(
281286
};
282287
let server =
283288
ProducerServer::start(&config).await.map_err(|e| e.to_string())?;
289+
Ok(server)
290+
}
284291

292+
/// Registers an arbitrary producer with the test server.
293+
pub fn register_producer(
294+
server: &ProducerServer,
295+
producer: impl oximeter::Producer,
296+
) -> Result<(), String> {
297+
server.registry().register_producer(producer).map_err(|e| e.to_string())?;
298+
Ok(())
299+
}
300+
301+
/// Registers a sample-generating test-specific producer.
302+
pub fn register_test_producer(server: &ProducerServer) -> Result<(), String> {
285303
// Create and register an actual metric producer.
286-
let producer = IntegrationProducer {
304+
let test_producer = IntegrationProducer {
287305
target: IntegrationTarget {
288306
name: "integration-test-target".to_string(),
289307
},
@@ -292,8 +310,7 @@ pub async fn start_producer_server(
292310
datum: 0,
293311
},
294312
};
295-
server.registry().register_producer(producer).map_err(|e| e.to_string())?;
296-
Ok(server)
313+
register_producer(server, test_producer)
297314
}
298315

299316
/// Returns whether the two identity metadata objects are identical.

nexus/test-utils/src/resource_helpers.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,28 +188,35 @@ pub async fn create_disk(
188188
.await
189189
}
190190

191+
/// Creates an instance with a default NIC and no disks.
192+
///
193+
/// Wrapper around [`create_instance_with`].
191194
pub async fn create_instance(
192195
client: &ClientTestContext,
193196
organization_name: &str,
194197
project_name: &str,
195198
instance_name: &str,
196199
) -> Instance {
197-
create_instance_with_nics(
200+
create_instance_with(
198201
client,
199202
organization_name,
200203
project_name,
201204
instance_name,
202205
&params::InstanceNetworkInterfaceAttachment::Default,
206+
// Disks=
207+
vec![],
203208
)
204209
.await
205210
}
206211

207-
pub async fn create_instance_with_nics(
212+
/// Creates an instance with attached resources.
213+
pub async fn create_instance_with(
208214
client: &ClientTestContext,
209215
organization_name: &str,
210216
project_name: &str,
211217
instance_name: &str,
212218
nics: &params::InstanceNetworkInterfaceAttachment,
219+
disks: Vec<params::InstanceDiskAttachment>,
213220
) -> Instance {
214221
let url = format!(
215222
"/organizations/{}/projects/{}/instances",
@@ -231,7 +238,7 @@ pub async fn create_instance_with_nics(
231238
.to_vec(),
232239
network_interfaces: nics.clone(),
233240
external_ips: vec![],
234-
disks: vec![],
241+
disks,
235242
},
236243
)
237244
.await

0 commit comments

Comments
 (0)