fix(git/hook): address review findings — fs blocking, redis timeout, backoff, slog

- sync/mod.rs: wrap scan_skills_from_dir in spawn_blocking to avoid
  blocking the async executor; use to_path_buf() to get owned PathBuf
- pool/worker.rs: replace the 500ms poll sleep with a CancellationToken
  is_cancelled() check at the top of the loop (eliminates artificial
  latency); add exponential backoff on Redis errors (1s base, 32s cap,
  reset on success); drop the spawn_blocking + block_on wrappers around
  fsck_only/gc_only in favor of direct .await (block_on inside
  spawn_blocking on the same runtime is an anti-pattern)
- pool/redis.rs: add 5s timeout on pool.get() for all three methods
  (next, ack_raw, nak_with_retry) to prevent indefinite blocking on
  unresponsive Redis
- sync/gc.rs: add comment explaining why git gc --auto non-zero exit
  is benign
- webhook_dispatch.rs: remove unnecessary format! wrappers in slog macros
- config/hook.rs: document max_concurrent intent (K8s operator/HPA, not
  the single-threaded worker itself)
This commit is contained in:
ZhenYi 2026-04-17 13:20:31 +08:00
parent ef61b193c4
commit 1fed9fc8ab
6 changed files with 53 additions and 60 deletions

View File

@ -3,6 +3,8 @@ use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PoolConfig { pub struct PoolConfig {
/// Intended concurrency (used by K8s operator/HPA, not the worker itself).
/// The worker is single-threaded by design; K8s replicas provide parallelism.
pub max_concurrent: usize, pub max_concurrent: usize,
pub cpu_threshold: f32, pub cpu_threshold: f32,
/// Hash-tag-prefixed Redis key prefix for hook task queues. /// Hash-tag-prefixed Redis key prefix for hook task queues.

View File

@ -2,6 +2,7 @@ use crate::error::GitError;
use crate::hook::pool::types::HookTask; use crate::hook::pool::types::HookTask;
use deadpool_redis::cluster::Connection as RedisConn; use deadpool_redis::cluster::Connection as RedisConn;
use slog::Logger; use slog::Logger;
use std::time::Duration;
/// Redis List consumer using BLMOVE for atomic move-from-queue-to-work pattern. /// Redis List consumer using BLMOVE for atomic move-from-queue-to-work pattern.
pub struct RedisConsumer { pub struct RedisConsumer {
@ -12,6 +13,8 @@ pub struct RedisConsumer {
logger: Logger, logger: Logger,
} }
const POOL_GET_TIMEOUT: Duration = Duration::from_secs(5);
impl RedisConsumer { impl RedisConsumer {
pub fn new( pub fn new(
pool: deadpool_redis::cluster::Pool, pool: deadpool_redis::cluster::Pool,
@ -36,10 +39,9 @@ impl RedisConsumer {
let queue_key = format!("{}:{}", self.prefix, task_type); let queue_key = format!("{}:{}", self.prefix, task_type);
let work_key = format!("{}:{}:work", self.prefix, task_type); let work_key = format!("{}:{}:work", self.prefix, task_type);
let redis = self let redis = tokio::time::timeout(POOL_GET_TIMEOUT, self.pool.get())
.pool
.get()
.await .await
.map_err(|_| GitError::Internal("redis pool get timed out".into()))?
.map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?; .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
let mut conn: RedisConn = redis; let mut conn: RedisConn = redis;
@ -90,10 +92,9 @@ impl RedisConsumer {
} }
async fn ack_raw(&self, work_key: &str, task_json: &str) -> Result<(), GitError> { async fn ack_raw(&self, work_key: &str, task_json: &str) -> Result<(), GitError> {
let redis = self let redis = tokio::time::timeout(POOL_GET_TIMEOUT, self.pool.get())
.pool
.get()
.await .await
.map_err(|_| GitError::Internal("redis pool get timed out".into()))?
.map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?; .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
let mut conn: RedisConn = redis; let mut conn: RedisConn = redis;
@ -129,10 +130,9 @@ impl RedisConsumer {
old_task_json: &str, old_task_json: &str,
new_task_json: &str, new_task_json: &str,
) -> Result<(), GitError> { ) -> Result<(), GitError> {
let redis = self let redis = tokio::time::timeout(POOL_GET_TIMEOUT, self.pool.get())
.pool
.get()
.await .await
.map_err(|_| GitError::Internal("redis pool get timed out".into()))?
.map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?; .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
let mut conn: RedisConn = redis; let mut conn: RedisConn = redis;

View File

@ -44,26 +44,30 @@ impl HookWorker {
slog::info!(self.logger, "hook worker started"); slog::info!(self.logger, "hook worker started");
let task_types = [TaskType::Sync, TaskType::Fsck, TaskType::Gc]; let task_types = [TaskType::Sync, TaskType::Fsck, TaskType::Gc];
let poll_interval = Duration::from_millis(500); let mut redis_backoff_ms: u64 = 1000;
loop { loop {
tokio::select! { // Check cancellation at top of loop to avoid unnecessary work
_ = cancel.cancelled() => { if cancel.is_cancelled() {
slog::info!(self.logger, "hook worker shutdown signal received"); slog::info!(self.logger, "hook worker shutdown signal received");
break; break;
}
_ = tokio::time::sleep(poll_interval) => {}
} }
for task_type in &task_types { for task_type in &task_types {
let result = self.consumer.next(&task_type.to_string()).await; let result = self.consumer.next(&task_type.to_string()).await;
let (task, task_json) = match result { let (task, task_json) = match result {
Ok(Some(pair)) => pair, Ok(Some(pair)) => {
// Reset backoff on successful dequeue
redis_backoff_ms = 1000;
pair
}
Ok(None) => continue, Ok(None) => continue,
Err(e) => { Err(e) => {
slog::warn!(self.logger, "failed to dequeue task: {}", e); slog::warn!(self.logger, "failed to dequeue task: {}", e);
tokio::time::sleep(Duration::from_secs(1)).await; tokio::time::sleep(Duration::from_millis(redis_backoff_ms)).await;
// Exponential backoff, cap at 32s
redis_backoff_ms = (redis_backoff_ms * 2).min(32_000);
break; break;
} }
}; };
@ -338,20 +342,8 @@ impl HookWorker {
let db = self.db.clone(); let db = self.db.clone();
let cache = self.cache.clone(); let cache = self.cache.clone();
let logger = self.logger.clone(); let logger = self.logger.clone();
let sync = HookMetaDataSync::new(db, cache, repo, logger)?;
tokio::task::spawn_blocking(move || { sync.fsck_only().await?;
let result = tokio::runtime::Handle::current().block_on(async {
let sync = HookMetaDataSync::new(db.clone(), cache.clone(), repo.clone(), logger.clone())?;
sync.fsck_only().await
});
match result {
Ok(()) => Ok::<(), GitError>(()),
Err(e) => Err(GitError::Internal(e.to_string())),
}
})
.await
.map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
.map_err(GitError::from)?;
Ok(()) Ok(())
} }
@ -376,20 +368,8 @@ impl HookWorker {
let db = self.db.clone(); let db = self.db.clone();
let cache = self.cache.clone(); let cache = self.cache.clone();
let logger = self.logger.clone(); let logger = self.logger.clone();
let sync = HookMetaDataSync::new(db, cache, repo, logger)?;
tokio::task::spawn_blocking(move || { sync.gc_only().await?;
let result = tokio::runtime::Handle::current().block_on(async {
let sync = HookMetaDataSync::new(db.clone(), cache.clone(), repo.clone(), logger.clone())?;
sync.gc_only().await
});
match result {
Ok(()) => Ok::<(), GitError>(()),
Err(e) => Err(GitError::Internal(e.to_string())),
}
})
.await
.map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
.map_err(GitError::from)?;
Ok(()) Ok(())
} }

View File

@ -18,6 +18,8 @@ impl HookMetaDataSync {
.map_err(|e| GitError::IoError(format!("git gc failed: {}", e)))?; .map_err(|e| GitError::IoError(format!("git gc failed: {}", e)))?;
if !status.success() { if !status.success() {
// git gc --auto exits non-zero when there's nothing to collect,
// or when another gc is already running — both are benign.
slog::warn!(logger, "git gc exited with {:?}", status.code()); slog::warn!(logger, "git gc exited with {:?}", status.code());
} }

View File

@ -283,7 +283,7 @@ impl HookMetaDataSync {
let project_uid = self.repo.project; let project_uid = self.repo.project;
let repo_root = match self.domain.repo().workdir() { let repo_root = match self.domain.repo().workdir() {
Some(p) => p, Some(p) => p.to_path_buf(),
None => return, None => return,
}; };
@ -296,12 +296,21 @@ impl HookMetaDataSync {
.map(|oid| oid.to_string()) .map(|oid| oid.to_string())
.unwrap_or_default(); .unwrap_or_default();
let discovered = match scan_skills_from_dir(repo_root, &self.repo.id, &commit_sha) { let repo_id = self.repo.id;
Ok(d) => d, let discovered = match tokio::task::spawn_blocking(move || {
Err(e) => { scan_skills_from_dir(&repo_root, &repo_id, &commit_sha)
})
.await
{
Ok(Ok(d)) => d,
Ok(Err(e)) => {
slog::warn!(self.logger, "failed to scan skills directory: {}", e); slog::warn!(self.logger, "failed to scan skills directory: {}", e);
return; return;
} }
Err(e) => {
slog::warn!(self.logger, "spawn_blocking join error: {}", e);
return;
}
}; };
if discovered.is_empty() { if discovered.is_empty() {

View File

@ -231,7 +231,7 @@ pub async fn dispatch_repo_webhooks(
{ {
Ok(ws) => ws, Ok(ws) => ws,
Err(e) => { Err(e) => {
slog::error!(logs, "{}", format!("failed to query webhooks repo={} error={}", repo_uuid, e)); slog::error!(logs, "failed to query webhooks repo={} error={}", repo_uuid, e);
return; return;
} }
}; };
@ -297,7 +297,7 @@ pub async fn dispatch_repo_webhooks(
let body = match serde_json::to_vec(&payload) { let body = match serde_json::to_vec(&payload) {
Ok(b) => b, Ok(b) => b,
Err(e) => { Err(e) => {
slog::error!(logs, "{}", format!("failed to serialize push payload error={}", e)); slog::error!(logs, "failed to serialize push payload error={}", e);
continue; continue;
} }
}; };
@ -310,15 +310,15 @@ pub async fn dispatch_repo_webhooks(
.await .await
{ {
Ok(Ok(())) => { Ok(Ok(())) => {
slog::info!(logs, "{}", format!("push webhook delivered webhook_id={} url={}", webhook_id, url)); slog::info!(logs, "push webhook delivered webhook_id={} url={}", webhook_id, url);
let _ = touch_webhook(db, webhook_id, true, logs).await; let _ = touch_webhook(db, webhook_id, true, logs).await;
} }
Ok(Err(e)) => { Ok(Err(e)) => {
slog::warn!(logs, "{}", format!("push webhook delivery failed webhook_id={} url={} error={}", webhook_id, url, e)); slog::warn!(logs, "push webhook delivery failed webhook_id={} url={} error={}", webhook_id, url, e);
let _ = touch_webhook(db, webhook_id, false, logs).await; let _ = touch_webhook(db, webhook_id, false, logs).await;
} }
Err(_) => { Err(_) => {
slog::warn!(logs, "{}", format!("push webhook timed out webhook_id={} url={}", webhook_id, url)); slog::warn!(logs, "push webhook timed out webhook_id={} url={}", webhook_id, url);
let _ = touch_webhook(db, webhook_id, false, logs).await; let _ = touch_webhook(db, webhook_id, false, logs).await;
} }
} }
@ -351,7 +351,7 @@ pub async fn dispatch_repo_webhooks(
let body = match serde_json::to_vec(&payload) { let body = match serde_json::to_vec(&payload) {
Ok(b) => b, Ok(b) => b,
Err(e) => { Err(e) => {
slog::error!(logs, "{}", format!("failed to serialize tag payload error={}", e)); slog::error!(logs, "failed to serialize tag payload error={}", e);
continue; continue;
} }
}; };
@ -364,15 +364,15 @@ pub async fn dispatch_repo_webhooks(
.await .await
{ {
Ok(Ok(())) => { Ok(Ok(())) => {
slog::info!(logs, "{}", format!("tag webhook delivered webhook_id={} url={}", webhook_id, url)); slog::info!(logs, "tag webhook delivered webhook_id={} url={}", webhook_id, url);
let _ = touch_webhook(db, webhook_id, true, logs).await; let _ = touch_webhook(db, webhook_id, true, logs).await;
} }
Ok(Err(e)) => { Ok(Err(e)) => {
slog::warn!(logs, "{}", format!("tag webhook delivery failed webhook_id={} url={} error={}", webhook_id, url, e)); slog::warn!(logs, "tag webhook delivery failed webhook_id={} url={} error={}", webhook_id, url, e);
let _ = touch_webhook(db, webhook_id, false, logs).await; let _ = touch_webhook(db, webhook_id, false, logs).await;
} }
Err(_) => { Err(_) => {
slog::warn!(logs, "{}", format!("tag webhook timed out webhook_id={} url={}", webhook_id, url)); slog::warn!(logs, "tag webhook timed out webhook_id={} url={}", webhook_id, url);
let _ = touch_webhook(db, webhook_id, false, logs).await; let _ = touch_webhook(db, webhook_id, false, logs).await;
} }
} }
@ -405,6 +405,6 @@ async fn touch_webhook(db: &AppDatabase, webhook_id: i64, success: bool, logs: &
}; };
if let Err(e) = result { if let Err(e) = result {
slog::warn!(logs, "{}", format!("failed to update webhook touch error={}", e)); slog::warn!(logs, "failed to update webhook touch error={}", e);
} }
} }