diff --git a/Cargo.lock b/Cargo.lock index 84ab24ae..00e7777b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -402,6 +402,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-epoch" version = "0.9.5" @@ -454,6 +464,16 @@ dependencies = [ "sct", ] +[[package]] +name = "dashmap" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77a43b28d0668df09411cb0bc9a8c2adc40f9a048afe863e05fd43251e8e39c" +dependencies = [ + "cfg-if", + "num_cpus", +] + [[package]] name = "digest" version = "0.9.0" @@ -666,6 +686,7 @@ dependencies = [ "bytes 1.1.0", "futures", "futures-util", + "garage_admin", "garage_api", "garage_model 0.6.0", "garage_rpc 0.6.0", @@ -690,6 +711,23 @@ dependencies = [ "toml", ] +[[package]] +name = "garage_admin" +version = "0.6.0" +dependencies = [ + "futures", + "futures-util", + "garage_model 0.6.0", + "garage_util 0.6.0", + "http", + "hyper", + "lazy_static", + "log", + "opentelemetry", + "opentelemetry-prometheus", + "prometheus", +] + [[package]] name = "garage_api" version = "0.6.0" @@ -1476,6 +1514,38 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6105e89802af13fdf48c49d7646d3b533a70e536d818aae7e78ba0433d01acb8" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "js-sys", + "lazy_static", + "percent-encoding", + "pin-project", + "rand", + "thiserror", +] + +[[package]] +name = "opentelemetry-prometheus" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9328977e479cebe12ce0d3fcecdaea4721d234895a9440c5b5dfd113f0594ac6" +dependencies = [ + "opentelemetry", + "prometheus", + "protobuf", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -1606,6 +1676,27 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "prometheus" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror", +] + +[[package]] +name = "protobuf" +version = "2.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" + [[package]] name = "quick-error" version = "1.2.3" diff --git a/Cargo.toml b/Cargo.toml index 739e698e..88c8ad7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,9 +4,10 @@ members = [ "src/rpc", "src/table", "src/model", + "src/admin", "src/api", "src/web", - "src/garage", + "src/garage" ] [profile.dev] diff --git a/script/dev-cluster.sh b/script/dev-cluster.sh index c1ffb355..5cc003ef 100755 --- a/script/dev-cluster.sh +++ b/script/dev-cluster.sh @@ -44,6 +44,9 @@ root_domain = ".s3.garage.localhost" bind_addr = "0.0.0.0:$((3920+$count))" root_domain = ".web.garage.localhost" index = "index.html" + +[admin_api] +bind_addr = "0.0.0.0:$((9900+$count))" EOF echo -en "$LABEL configuration written to $CONF_PATH\n" diff --git a/src/admin/Cargo.toml b/src/admin/Cargo.toml new file mode 100644 index 00000000..9775b667 --- /dev/null +++ b/src/admin/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "garage_admin" +version = "0.6.0" +authors = ["Maximilien Richer "] +edition = "2018" +license = "AGPL-3.0" +description = "Administration and metrics REST HTTP server for Garage" +repository = "https://git.deuxfleurs.fr/Deuxfleurs/garage" + +[lib] +path = "lib.rs" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +garage_model = { version = "0.6.0", path = "../model" } +garage_util = { version = "0.6.0", path = "../util" } + +futures = "0.3" +futures-util = "0.3" +http = "0.2" +hyper = "0.14" +log = "0.4" + +opentelemetry = "0.17" +opentelemetry-prometheus = "0.10" +prometheus = "0.13" +lazy_static = "1.4" diff --git a/src/admin/lib.rs b/src/admin/lib.rs new file mode 100644 index 00000000..443361be --- /dev/null +++ b/src/admin/lib.rs @@ -0,0 +1,6 @@ +//! Crate for handling the admin and metric HTTP APIs +#[macro_use] +extern crate log; +extern crate lazy_static; + +pub mod metrics; diff --git a/src/admin/metrics.rs b/src/admin/metrics.rs new file mode 100644 index 00000000..547ee4c8 --- /dev/null +++ b/src/admin/metrics.rs @@ -0,0 +1,141 @@ +use hyper::{ + header::CONTENT_TYPE, + service::{make_service_fn, service_fn}, + Body, Method, Request, Response, Server, +}; +use lazy_static::lazy_static; +use opentelemetry::{ + global, + metrics::{BoundCounter, BoundValueRecorder}, + KeyValue, +}; +use opentelemetry_prometheus::PrometheusExporter; +use prometheus::{Encoder, TextEncoder}; +use std::convert::Infallible; +use std::sync::Arc; +use std::time::SystemTime; + +use futures::future::*; +use garage_model::garage::Garage; +use garage_util::error::Error as GarageError; + +lazy_static! { + // This defines the differennt tags that will be referenced by the object + static ref HANDLER_ALL: [KeyValue; 1] = [KeyValue::new("handler", "all")]; +} + +// serve_req on metric endpoint +async fn serve_req( + req: Request, + admin_server: Arc, +) -> Result, hyper::Error> { + println!("Receiving request at path {}", req.uri()); + let request_start = SystemTime::now(); + + admin_server.metrics.http_counter.add(1); + + let response = match (req.method(), req.uri().path()) { + (&Method::GET, "/metrics") => { + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + let metric_families = admin_server.exporter.registry().gather(); + encoder.encode(&metric_families, &mut buffer).unwrap(); + admin_server + .metrics + .http_body_gauge + .record(buffer.len() as u64); + + Response::builder() + .status(200) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + .unwrap() + } + _ => Response::builder() + .status(404) + .body(Body::from("Not implemented")) + .unwrap(), + }; + + admin_server + .metrics + .http_req_histogram + .record(request_start.elapsed().map_or(0.0, |d| d.as_secs_f64())); + Ok(response) +} + +// AdminServer hold the admin server internal admin_server and the metric exporter +pub struct AdminServer { + exporter: PrometheusExporter, + metrics: AdminServerMetrics, +} + +// GarageMetricadmin_server holds the metrics counter definition for Garage +// FIXME: we would rather have that split up among the different libraries? +struct AdminServerMetrics { + http_counter: BoundCounter, + http_body_gauge: BoundValueRecorder, + http_req_histogram: BoundValueRecorder, + bucket_v2_merkle_updater_todo_queue_length: BoundValueRecorder, +} + +impl AdminServer { + /// init initilialize the AdminServer and background metric server + pub fn init() -> AdminServer { + let exporter = opentelemetry_prometheus::exporter().init(); + let meter = global::meter("garage/admin_server"); + AdminServer { + exporter, + metrics: AdminServerMetrics { + http_counter: meter + .u64_counter("router.http_requests_total") + .with_description("Total number of HTTP requests made.") + .init() + .bind(HANDLER_ALL.as_ref()), + http_body_gauge: meter + .u64_value_recorder("example.http_response_size_bytes") + .with_description("The metrics HTTP response sizes in bytes.") + .init() + .bind(HANDLER_ALL.as_ref()), + http_req_histogram: meter + .f64_value_recorder("example.http_request_duration_seconds") + .with_description("The HTTP request latencies in seconds.") + .init() + .bind(HANDLER_ALL.as_ref()), + bucket_v2_merkle_updater_todo_queue_length: meter + .f64_value_recorder("bucket_v2.merkle_updater.todo_queue_length") + .with_description("Bucket merkle updater TODO queue length.") + .init() + .bind(HANDLER_ALL.as_ref()), + }, + } + } + /// run execute the admin server on the designated HTTP port and listen for requests + pub async fn run( + self, + garage: Arc, + shutdown_signal: impl Future, + ) -> Result<(), GarageError> { + let admin_server = Arc::new(self); + // For every connection, we must make a `Service` to handle all + // incoming HTTP requests on said connection. + let make_svc = make_service_fn(move |_conn| { + let admin_server = admin_server.clone(); + // This is the `Service` that will handle the connection. + // `service_fn` is a helper to convert a function that + // returns a Response into a `Service`. + async move { + Ok::<_, Infallible>(service_fn(move |req| serve_req(req, admin_server.clone()))) + } + }); + + let addr = &garage.config.admin_api.bind_addr; + + let server = Server::bind(&addr).serve(make_svc); + let graceful = server.with_graceful_shutdown(shutdown_signal); + info!("Admin server listening on http://{}", addr); + + graceful.await?; + Ok(()) + } +} diff --git a/src/garage/Cargo.toml b/src/garage/Cargo.toml index d6034bbd..e4304b96 100644 --- a/src/garage/Cargo.toml +++ b/src/garage/Cargo.toml @@ -27,6 +27,7 @@ garage_rpc = { version = "0.6.0", path = "../rpc" } garage_table = { version = "0.6.0", path = "../table" } garage_util = { version = "0.6.0", path = "../util" } garage_web = { version = "0.6.0", path = "../web" } +garage_admin = { version = "0.6.0", path = "../admin" } bytes = "1.0" git-version = "0.3.4" diff --git a/src/garage/server.rs b/src/garage/server.rs index f4d62e91..923df1cd 100644 --- a/src/garage/server.rs +++ b/src/garage/server.rs @@ -6,6 +6,7 @@ use garage_util::background::*; use garage_util::config::*; use garage_util::error::Error; +use garage_admin::metrics::*; use garage_api::run_api_server; use garage_model::garage::Garage; use garage_web::run_web_server; @@ -34,6 +35,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { .open() .expect("Unable to open sled DB"); + info!("Configure and run admin web server..."); + let admin_server_init = AdminServer::init(); + info!("Initializing background runner..."); let watch_cancel = netapp::util::watch_ctrl_c(); let (background, await_background_done) = BackgroundRunner::new(16, watch_cancel.clone()); @@ -43,7 +47,7 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { let run_system = tokio::spawn(garage.system.clone().run(watch_cancel.clone())); - info!("Crate admin RPC handler..."); + info!("Create admin RPC handler..."); AdminRpcHandler::new(garage.clone()); info!("Initializing API server..."); @@ -58,6 +62,10 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { wait_from(watch_cancel.clone()), )); + info!("Configure and run admin web server..."); + let admin_server = + tokio::spawn(admin_server_init.run(garage.clone(), wait_from(watch_cancel.clone()))); + // Stuff runs // When a cancel signal is sent, stuff stops @@ -67,6 +75,9 @@ pub async fn run_server(config_file: PathBuf) -> Result<(), Error> { if let Err(e) = web_server.await? { warn!("Web server exited with error: {}", e); } + if let Err(e) = admin_server.await? { + warn!("Admin web server exited with error: {}", e); + } // Remove RPC handlers for system to break reference cycles garage.system.netapp.drop_all_handlers(); diff --git a/src/util/config.rs b/src/util/config.rs index f1f4b06a..c88795e5 100644 --- a/src/util/config.rs +++ b/src/util/config.rs @@ -66,6 +66,9 @@ pub struct Config { /// Configuration for serving files as normal web server pub s3_web: WebConfig, + + /// Configuration for the admin API endpoint + pub admin_api: AdminConfig, } /// Configuration for S3 api @@ -89,6 +92,13 @@ pub struct WebConfig { pub root_domain: String, } +/// Configuration for the admin and monitoring HTTP API +#[derive(Deserialize, Debug, Clone)] +pub struct AdminConfig { + /// Address and port to bind for admin API serving + pub bind_addr: SocketAddr, +} + fn default_sled_cache_capacity() -> u64 { 128 * 1024 * 1024 }