BotTerre/libs/clickhouse_pool/src/pool.rs

655 lines
20 KiB
Rust

use std::fmt;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use clickhouse::Client;
use deadpool::managed::{Manager, Metrics, Object, PoolError, RecycleError};
use thiserror::Error;
use tokio::task;
use tokio::time::timeout;
use crate::config::{ClickhouseConfig, DatalakeConfig};
use crate::metrics::{Kind, MetricConfig, Registry, SharedRegistrar};
#[derive(Debug, Error)]
pub enum ClickhouseError {
#[error("Clickhouse client error: {0}")]
Client(#[from] clickhouse::error::Error),
#[error("Connection validation failed: {0}")]
Validation(String),
#[error("Connection timed out")]
Timeout,
#[error("Pool error: {0}")]
Pool(String),
#[error("Shutdown in progress")]
ShuttingDown,
#[error("Batch insertion error: {0}")]
BatchInsertionError(String),
}
impl From<tokio::time::error::Elapsed> for ClickhouseError {
fn from(_: tokio::time::error::Elapsed) -> Self {
Self::Timeout
}
}
impl<T: std::fmt::Display> From<PoolError<T>> for ClickhouseError {
fn from(value: PoolError<T>) -> Self {
Self::Pool(value.to_string())
}
}
#[derive(Debug, Clone)]
pub struct PoolMetrics {
pub size: usize,
pub available: usize,
pub in_use: usize,
pub max_size: usize,
pub min_size: usize,
pub waiters: usize,
}
pub struct ClickhouseConnection {
client: Client,
last_used: Instant,
id: u64,
query_count: AtomicU64,
created_at: Instant,
}
impl ClickhouseConnection {
pub fn new(client: Client, id: u64) -> Self {
Self {
client,
last_used: Instant::now(),
id,
query_count: AtomicU64::new(0),
created_at: Instant::now(),
}
}
pub fn id(&self) -> u64 {
self.id
}
pub fn age(&self) -> Duration {
self.created_at.elapsed()
}
pub fn idle_time(&self) -> Duration {
self.last_used.elapsed()
}
pub fn query_count(&self) -> u64 {
self.query_count.load(Ordering::Relaxed)
}
pub async fn health_check(&self) -> Result<(), ClickhouseError> {
match self.client.query("SELECT 1").execute().await {
Ok(_) => Ok(()),
Err(e) => Err(ClickhouseError::Client(e)),
}
}
}
impl Deref for ClickhouseConnection {
type Target = Client;
fn deref(&self) -> &Self::Target {
&self.client
}
}
impl DerefMut for ClickhouseConnection {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.client
}
}
impl fmt::Debug for ClickhouseConnection {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("ClickhouseConnection")
.field("id", &self.id)
.field("created_at", &self.created_at)
.field("query_count", &self.query_count)
.field("last_used", &self.last_used)
.finish()
}
}
pub fn get_query_type(query: &str) -> &'static str {
let query = query.trim_start().to_uppercase();
if query.starts_with("SELECT") {
"select"
} else if query.starts_with("INSERT") {
"insert"
} else if query.starts_with("CREATE") {
"create"
} else if query.starts_with("ALTER") {
"alter"
} else if query.starts_with("DROP") {
"drop"
} else {
"other"
}
}
#[derive(Debug)]
pub struct ClickhouseConnectionManager {
config: Arc<ClickhouseConfig>,
next_connection_id: AtomicU64,
is_shutting_down: Arc<AtomicBool>,
metrics: Option<SharedRegistrar>,
}
impl ClickhouseConnectionManager {
pub fn new(config: Arc<ClickhouseConfig>, metrics: Option<SharedRegistrar>) -> Self {
Self {
config,
next_connection_id: AtomicU64::new(1),
is_shutting_down: Arc::new(AtomicBool::new(false)),
metrics,
}
}
pub fn initiate_shutdown(&self) {
self.is_shutting_down.store(true, Ordering::SeqCst);
log::info!("Clickhouse connection manager shutdown in progress");
}
pub fn create_client(&self) -> Result<Client, ClickhouseError> {
let url = self.config.authenticated_connection_url();
let client = Client::default()
.with_url(url)
.with_user(&self.config.username)
.with_password(&self.config.password)
.with_option("async_insert", "1")
.with_option("wait_for_async_insert", "1");
Ok(client)
}
}
impl Manager for ClickhouseConnectionManager {
type Type = ClickhouseConnection;
type Error = ClickhouseError;
async fn create(&self) -> Result<Self::Type, Self::Error> {
if self.is_shutting_down.load(Ordering::SeqCst) {
return Err(ClickhouseError::ShuttingDown);
}
let connection_id = self.next_connection_id.fetch_add(1, Ordering::SeqCst);
let start = Instant::now();
let config = &self.config.clone();
log::debug!(
"Creating new Clickhouse connection [id: {}] to: {}:{}",
connection_id,
config.host,
config.port
);
let client = self.create_client()?;
let validation_timeout = Duration::from_secs(config.connect_timeout_seconds);
let validation = match timeout(validation_timeout, client.query("SELECT 1").execute()).await
{
Ok(Ok(_)) => Ok(()),
Ok(Err(e)) => Err(ClickhouseError::Client(e)),
Err(_) => Err(ClickhouseError::Timeout),
};
let duration = start.elapsed();
if let Some(metrics) = &self.metrics {
metrics.set_gauge_vec_mut(
"clickhouse_connection_creation_second",
&["create"],
duration.as_secs_f64(),
);
}
match validation {
Ok(()) => {
log::debug!(
"Connection established: [id: {}] in {:?}",
connection_id,
duration
);
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connections_created_total",
&["success"],
);
}
Ok(ClickhouseConnection::new(client, connection_id))
}
Err(e) => {
log::error!(
"Failed to validate ClickHouse connection (id: {}): {}",
connection_id,
e
);
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connections_created_total",
&["failure"],
);
}
Err(e)
}
}
}
async fn recycle(
&self,
conn: &mut Self::Type,
_: &Metrics,
) -> Result<(), RecycleError<Self::Error>> {
if self.is_shutting_down.load(Ordering::SeqCst) {
return Err(RecycleError::Message("Shutting down".into()));
}
log::debug!("Testing health of connection: [id: {}]", conn.id());
let validation_timeout = Duration::from_secs(self.config.connect_timeout_seconds);
match timeout(validation_timeout, conn.query("SELECT 1").execute()).await {
Ok(Ok(_)) => {
log::debug!("Connection [id: {}] health check passed", conn.id());
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connections_health_checks_total",
&["success"],
);
}
Ok(())
}
Ok(Err(e)) => {
log::warn!("Connection [id: {}] health check failed: {}", conn.id(), e);
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connections_health_checks_total",
&["failure"],
);
}
Err(RecycleError::Message(
format!("Health check failed: {}", e).into(),
))
}
Err(_) => {
log::warn!(
"Connection [id: {}] health check timed out after: {:?}",
conn.id(),
validation_timeout
);
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connections_health_checks_total",
&["timeout"],
);
}
Err(RecycleError::Message("Health check timed out".into()))
}
}
}
}
pub type Pool = deadpool::managed::Pool<ClickhouseConnectionManager>;
pub type PooledConnection = Object<ClickhouseConnectionManager>;
pub struct ClickhouseConnectionPool {
pool: Pool,
config: Arc<DatalakeConfig>,
metrics: Option<SharedRegistrar>,
is_initialized: AtomicBool,
}
impl ClickhouseConnectionPool {
pub fn new(config: Arc<DatalakeConfig>, metrics: Option<SharedRegistrar>) -> Self {
if let Some(metrics_ref) = &metrics {
Self::register_metrics(metrics_ref);
}
let initial_size = config.clickhouse.max_connections as usize;
let manager = ClickhouseConnectionManager::new(config.clickhouse.clone(), metrics.clone());
let pool = deadpool::managed::Pool::<ClickhouseConnectionManager>::builder(manager)
.max_size(initial_size)
.build()
.expect("Failed to build connection pool");
Self {
pool,
config,
metrics,
is_initialized: AtomicBool::new(false),
}
}
pub async fn initialize(&self) -> Result<(), ClickhouseError> {
if self.is_initialized.load(Ordering::SeqCst) {
return Ok(());
}
log::info!("Initializing Clickhouse connection pool");
let warmup_count = self.config.clickhouse.max_connections as usize;
let mut warmup_handles = Vec::with_capacity(warmup_count);
for i in 0..warmup_count {
let pool = self.pool.clone();
let handle = task::spawn(async move {
match pool.get().await {
Ok(conn) => match conn.health_check().await {
Ok(_) => {
log::debug!("Warm-up connection {} initialized successfully", i);
Ok(())
}
Err(e) => {
log::error!("Warm-up connection {} health check failed: {}", i, e);
Err(e)
}
},
Err(e) => {
log::error!("Failed to get warm-up connection {}: {}", i, e);
Err(ClickhouseError::Pool(e.to_string()))
}
}
});
warmup_handles.push(handle);
}
let mut warmup_success_count = 0;
for (i, handle) in warmup_handles.into_iter().enumerate() {
match handle.await {
Ok(Ok(_)) => {
warmup_success_count += 1;
}
Ok(Err(e)) => {
log::warn!("Warm-up connection {} failed: {}", i, e);
}
Err(e) => {
log::error!("Warm-up task {} panicked: {}", i, e);
}
}
}
log::info!(
"Connection pool warm-up complete: {}/{} successful",
warmup_success_count,
warmup_count
);
self.is_initialized.store(true, Ordering::SeqCst);
if let Some(metrics) = &self.metrics {
let status = self.pool.status();
metrics.set_int_gauge_vec_mut(
"clickhouse_pool_connections",
&["available"],
status.available as i64,
);
metrics.set_int_gauge_vec_mut(
"clickhouse_pool_connections",
&["size"],
status.size as i64,
);
}
Ok(())
}
pub async fn get_connection(&self) -> Result<PooledConnection, ClickhouseError> {
if !self.is_initialized.load(Ordering::SeqCst) {
log::warn!("Attempting to get connection from uninitialized pool");
}
let start = Instant::now();
let timeout_duration = Duration::from_secs(self.config.clickhouse.connect_timeout_seconds);
for attempt in 0..3 {
match tokio::time::timeout(timeout_duration, self.pool.get()).await {
Ok(Ok(conn)) => {
let duration = start.elapsed();
if let Some(metrics) = &self.metrics {
metrics.set_gauge_vec_mut(
"clickhouse_connection_acquisition_seconds",
&["success"],
duration.as_secs_f64(),
);
metrics.inc_int_counter_vec_mut(
"clickhouse_connection_acquisition_total",
&["success"],
);
}
log::debug!(
"Connection acquired in {:?} (attempt {})",
duration,
attempt + 1
);
return Ok(conn);
}
Ok(Err(e)) => {
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connection_acquisition_total",
&["failure"],
);
}
log::warn!(
"Failed to get connection from pool (attempt {}): {}",
attempt + 1,
e
);
if attempt >= 2 {
return Err(ClickhouseError::Pool(e.to_string()));
}
}
Err(_) => {
if let Some(metrics) = &self.metrics {
metrics.inc_int_counter_vec_mut(
"clickhouse_connection_acquisition_total",
&["timeout"],
);
}
log::warn!(
"Timed out waiting for connection (attempt {}) after {:?}",
attempt + 1,
timeout_duration
);
if attempt >= 2 {
return Err(ClickhouseError::Timeout);
}
}
}
let backoff = Duration::from_millis(50 * 2u64.pow(attempt));
tokio::time::sleep(backoff).await;
}
Err(ClickhouseError::Pool(
"Failed to get connection after retries".to_string(),
))
}
pub async fn shutdown(&self) -> Result<(), ClickhouseError> {
log::info!("Initiating graceful shutdown of ClickHouse connection pool");
let pool_manager = self.pool.manager();
pool_manager.initiate_shutdown();
let status = self.pool.status();
log::info!(
"Connection pool status before shutdown: size={}, available={}, in_use={}",
status.size,
status.available,
status.size - status.available
);
let drain_timeout = Duration::from_secs(30);
let drain_start = Instant::now();
loop {
let status = self.pool.status();
let in_use = status.size - status.available;
if in_use == 0 {
log::info!("All connections returned to pool, proceeding with shutdown");
break;
}
if drain_start.elapsed() > drain_timeout {
log::warn!(
"Shutdown drain timeout exceeded, {} connections still in use",
in_use
);
break;
}
log::info!("Waiting for {} connections to be returned to pool", in_use);
tokio::time::sleep(Duration::from_secs(1)).await;
}
// Close all connections
self.pool.close();
log::info!("All connections closed");
log::info!("ClickHouse connection pool shutdown complete");
Ok(())
}
fn register_metrics(metrics: &SharedRegistrar) {
let metric_configs = [
// Connection
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_connections_created_total",
help: "Total no. of connections created",
label_names: &["status"],
},
MetricConfig {
kind: Kind::IntGaugeVec,
name: "clickhouse_pool_connections",
help: "Current no. of connections in the pool",
label_names: &["state"],
},
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_connetion_health_checks_total",
help: "Total no. of connection health checks",
label_names: &["status"],
},
MetricConfig {
kind: Kind::GaugeVec,
name: "clickhouse_connection_creation_seconds",
help: "Time taken to create connections",
label_names: &["operation"],
},
// Queries
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_queries_total",
help: "Total no. of queries executed",
label_names: &["type"],
},
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_query_errors_total",
help: "Total no. of query errors",
label_names: &["type"],
},
MetricConfig {
kind: Kind::GaugeVec,
name: "clickhouse_query_duration_seconds",
help: "Query execution time in seconds",
label_names: &["type"],
},
// Batch queries
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_batch_query_errors_total",
help: "Total number of batch query errors",
label_names: &["type"],
},
MetricConfig {
kind: Kind::GaugeVec,
name: "clickhouse_batch_query_duration_seconds",
help: "Batch query execution time in seconds",
label_names: &["type"],
},
// Connection acquisition
MetricConfig {
kind: Kind::GaugeVec,
name: "clickhouse_connection_acquisition_seconds",
help: "Time taken to acquire a connection from the pool",
label_names: &["status"],
},
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_connection_acquisition_total",
help: "Total number of connection acquisition attempts",
label_names: &["status"],
},
MetricConfig {
kind: Kind::IntCounterVec,
name: "clickhouse_connections_recycled_total",
help: "Total number of connections recycled",
label_names: &["reason"],
},
MetricConfig {
kind: Kind::GaugeVec,
name: "clickhouse_connection_recycling_seconds",
help: "Time taken for connection recycling",
label_names: &["operation"],
},
];
metrics.with_metric_configs(&metric_configs).ok();
}
pub fn status(&self) -> PoolMetrics {
let status = self.pool.status();
PoolMetrics {
size: status.size,
available: status.available,
in_use: status.size - status.available,
max_size: status.max_size,
min_size: status.max_size,
waiters: status.waiting,
}
}
}