feat(git): add Redis-backed hook worker with per-repo distributed locking

- pool/worker.rs: single-threaded consumer that BLMOVEs tasks from Redis
  queues sequentially. K8s replicas provide HA — each pod runs one worker.
- pool/redis.rs: RedisConsumer with BLMOVE atomic dequeue, ACK/NAK, and
  retry-with-json support.
- pool/types.rs: HookTask, TaskType, PoolConfig (minimal — no pool metrics).
- sync/lock.rs: Redis SET NX EX per-repo lock to prevent concurrent workers
  from processing the same repo. Lock conflicts are handled by requeueing
  without incrementing retry count.
- hook/mod.rs: HookService.start_worker() spawns the background worker.
- ssh/mod.rs / http/mod.rs: ReceiveSyncService RPUSHes to Redis queue.
  Both run_http and run_ssh call start_worker() to launch the consumer.
- Lock conflicts (GitError::Locked) in the worker are requeued without
  incrementing retry_count so another worker can pick them up.
This commit is contained in:
ZhenYi 2026-04-17 12:33:58 +08:00
parent eeb99bf628
commit 8fb2436f22
11 changed files with 820 additions and 176 deletions

View File

@ -2,13 +2,20 @@ use config::AppConfig;
use db::cache::AppCache; use db::cache::AppCache;
use db::database::AppDatabase; use db::database::AppDatabase;
use deadpool_redis::cluster::Pool as RedisPool; use deadpool_redis::cluster::Pool as RedisPool;
use models::EntityTrait;
use slog::Logger; use slog::Logger;
use std::sync::Arc; use std::sync::Arc;
use tokio_util::sync::CancellationToken;
/// Simplified hook service — no queue, no pool. pub mod pool;
/// K8s StatefulSet HA scheduling ensures only one pod touches a repo at a time. pub mod sync;
/// Execution is direct and sequential per invocation. pub mod webhook_dispatch;
pub use pool::{HookWorker, PoolConfig, RedisConsumer};
pub use pool::types::{HookTask, TaskType};
/// Hook service that manages the Redis-backed task queue worker.
/// Multiple gitserver pods can run concurrently — the worker acquires a
/// per-repo Redis lock before processing each task.
#[derive(Clone)] #[derive(Clone)]
pub struct HookService { pub struct HookService {
pub(crate) db: AppDatabase, pub(crate) db: AppDatabase,
@ -38,92 +45,15 @@ impl HookService {
} }
} }
/// Full sync: refs → commits → tags → LFS → fsck → gc → skills. /// Start the background worker and return a cancellation token.
pub async fn sync_repo(&self, repo_id: &str) -> Result<(), crate::GitError> { pub fn start_worker(&self) -> CancellationToken {
let repo_uuid = models::Uuid::parse_str(repo_id) let pool_config = PoolConfig::from_env(&self.config);
.map_err(|_| crate::GitError::Internal("invalid repo_id uuid".into()))?; pool::start_worker(
let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
.one(self.db.reader())
.await
.map_err(crate::GitError::from)?
.ok_or_else(|| crate::GitError::NotFound(format!("repo {} not found", repo_id)))?;
if !std::path::Path::new(&repo.storage_path).exists() {
return Err(crate::GitError::NotFound(format!(
"storage path does not exist: {}",
repo.storage_path
)));
}
let sync = crate::hook::sync::HookMetaDataSync::new(
self.db.clone(), self.db.clone(),
self.cache.clone(), self.cache.clone(),
repo, self.redis_pool.clone(),
self.logger.clone(), self.logger.clone(),
)?; pool_config,
)
// No distributed lock needed — K8s StatefulSet scheduling guarantees
// that at most one pod processes a given repo shard at any time.
sync.sync().await
}
/// Run fsck only (no full sync).
pub async fn fsck_repo(&self, repo_id: &str) -> Result<(), crate::GitError> {
let repo_uuid = models::Uuid::parse_str(repo_id)
.map_err(|_| crate::GitError::Internal("invalid repo_id uuid".into()))?;
let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
.one(self.db.reader())
.await
.map_err(crate::GitError::from)?
.ok_or_else(|| crate::GitError::NotFound(format!("repo {} not found", repo_id)))?;
if !std::path::Path::new(&repo.storage_path).exists() {
return Err(crate::GitError::NotFound(format!(
"storage path does not exist: {}",
repo.storage_path
)));
}
let sync = crate::hook::sync::HookMetaDataSync::new(
self.db.clone(),
self.cache.clone(),
repo,
self.logger.clone(),
)?;
sync.fsck_only().await
}
/// Run gc only (no full sync).
pub async fn gc_repo(&self, repo_id: &str) -> Result<(), crate::GitError> {
let repo_uuid = models::Uuid::parse_str(repo_id)
.map_err(|_| crate::GitError::Internal("invalid repo_id uuid".into()))?;
let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
.one(self.db.reader())
.await
.map_err(crate::GitError::from)?
.ok_or_else(|| crate::GitError::NotFound(format!("repo {} not found", repo_id)))?;
if !std::path::Path::new(&repo.storage_path).exists() {
return Err(crate::GitError::NotFound(format!(
"storage path does not exist: {}",
repo.storage_path
)));
}
let sync = crate::hook::sync::HookMetaDataSync::new(
self.db.clone(),
self.cache.clone(),
repo,
self.logger.clone(),
)?;
sync.gc_only().await
} }
} }
pub mod sync;
pub mod webhook_dispatch;

41
libs/git/hook/pool/mod.rs Normal file
View File

@ -0,0 +1,41 @@
pub mod redis;
pub mod types;
pub mod worker;
pub use redis::RedisConsumer;
pub use types::{HookTask, PoolConfig, TaskType};
pub use worker::HookWorker;
use db::cache::AppCache;
use db::database::AppDatabase;
use deadpool_redis::cluster::Pool as RedisPool;
use slog::Logger;
use tokio_util::sync::CancellationToken;
/// Start the hook worker background task.
/// Returns a handle to the cancellation token so the caller can shut it down.
///
/// The worker is detached via `tokio::spawn`; cancelling the returned token is
/// the only way to stop it.
pub fn start_worker(
    db: AppDatabase,
    cache: AppCache,
    redis_pool: RedisPool,
    logger: Logger,
    config: PoolConfig,
) -> CancellationToken {
    // Move the pool and prefix in directly — neither is used again in this
    // function, so the clones the previous version made were redundant.
    let consumer = RedisConsumer::new(
        redis_pool,
        config.redis_list_prefix,
        config.redis_block_timeout_secs,
        logger.clone(),
    );
    let worker = HookWorker::new(db, cache, logger, consumer);
    let cancel = CancellationToken::new();
    let worker_cancel = cancel.clone();
    tokio::spawn(async move {
        worker.run(worker_cancel).await;
    });
    cancel
}

167
libs/git/hook/pool/redis.rs Normal file
View File

@ -0,0 +1,167 @@
use crate::error::GitError;
use crate::hook::pool::types::HookTask;
use deadpool_redis::cluster::Connection as RedisConn;
use slog::Logger;
/// Redis List consumer using BLMOVE for atomic move-from-queue-to-work pattern.
///
/// Tasks are JSON strings stored in `"{prefix}:{task_type}"` lists. `next()`
/// atomically moves one entry into `"{prefix}:{task_type}:work"` so an
/// in-flight task survives a consumer crash and can be inspected/recovered.
pub struct RedisConsumer {
    // Cluster-aware pool; a connection is checked out per command.
    pool: deadpool_redis::cluster::Pool,
    /// Hash-tag-prefixed key prefix, e.g. "{hook}".
    prefix: String,
    // Maximum time BLMOVE blocks waiting for a task before returning None.
    block_timeout_secs: u64,
    logger: Logger,
}
impl RedisConsumer {
    /// Build a consumer over `pool`, reading lists under `prefix`.
    pub fn new(
        pool: deadpool_redis::cluster::Pool,
        prefix: String,
        block_timeout_secs: u64,
        logger: Logger,
    ) -> Self {
        Self {
            pool,
            prefix,
            block_timeout_secs,
            logger,
        }
    }

    /// Atomically moves a task from the main queue to the work queue using BLMOVE.
    /// Blocks up to `block_timeout_secs` waiting for a task.
    ///
    /// Returns `Some((HookTask, task_json))` where `task_json` is the raw JSON string
    /// needed for LREM on ACK. Returns `None` if the blocking timed out.
    pub async fn next(&self, task_type: &str) -> Result<Option<(HookTask, String)>, GitError> {
        let queue_key = format!("{}:{}", self.prefix, task_type);
        let work_key = format!("{}:{}:work", self.prefix, task_type);
        let redis = self
            .pool
            .get()
            .await
            .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
        let mut conn: RedisConn = redis;
        // BLMOVE source destination <LEFT|RIGHT> <LEFT|RIGHT> timeout
        // Pops from the RIGHT of the queue (producers LPUSH, so RIGHT is the
        // oldest entry — FIFO) and pushes to the LEFT of the work list, in one
        // atomic server-side step.
        let task_json: Option<String> = redis::cmd("BLMOVE")
            .arg(&queue_key)
            .arg(&work_key)
            .arg("RIGHT")
            .arg("LEFT")
            .arg(self.block_timeout_secs)
            .query_async(&mut conn)
            .await
            .map_err(|e| GitError::Internal(format!("BLMOVE failed: {}", e)))?;
        match task_json {
            Some(json) => {
                match serde_json::from_str::<HookTask>(&json) {
                    Ok(task) => {
                        slog::debug!(self.logger, "task dequeued";
                            "task_id" => %task.id,
                            "task_type" => %task.task_type,
                            "queue" => %queue_key
                        );
                        Ok(Some((task, json)))
                    }
                    Err(e) => {
                        // Malformed task — remove from work queue and discard.
                        // Best effort: the removal error is deliberately ignored.
                        slog::warn!(self.logger, "malformed task JSON, discarding";
                            "error" => %e,
                            "queue" => %work_key
                        );
                        let _ = self.ack_raw(&work_key, &json).await;
                        Ok(None)
                    }
                }
            }
            None => {
                // Timed out, no task available
                Ok(None)
            }
        }
    }

    /// Acknowledge a task: remove it from the work queue (LREM).
    pub async fn ack(&self, work_key: &str, task_json: &str) -> Result<(), GitError> {
        self.ack_raw(work_key, task_json).await
    }

    // Removes one occurrence of `task_json` from `work_key`. Count -1 scans
    // from the tail — BLMOVE pushes to the head, so if the same payload is
    // in-flight twice the oldest copy is removed first.
    async fn ack_raw(&self, work_key: &str, task_json: &str) -> Result<(), GitError> {
        let redis = self
            .pool
            .get()
            .await
            .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
        let mut conn: RedisConn = redis;
        let _: i64 = redis::cmd("LREM")
            .arg(work_key)
            .arg(-1)
            .arg(task_json)
            .query_async(&mut conn)
            .await
            .map_err(|e| GitError::Internal(format!("LREM failed: {}", e)))?;
        Ok(())
    }

    /// Negative acknowledge (retry): remove from work queue and push back to main queue.
    /// Requeues the task JSON unchanged (retry_count untouched).
    pub async fn nak(
        &self,
        work_key: &str,
        queue_key: &str,
        task_json: &str,
    ) -> Result<(), GitError> {
        self.nak_with_retry(work_key, queue_key, task_json, task_json).await
    }

    /// Negative acknowledge with a different (updated) task JSON — used to
    /// requeue with an incremented retry_count.
    ///
    /// NOTE(review): the LREM and LPUSH run as two separate commands, not a
    /// transaction — a crash between them drops the task entirely. Confirm
    /// this at-most-once window is acceptable for hook tasks.
    pub async fn nak_with_retry(
        &self,
        work_key: &str,
        queue_key: &str,
        old_task_json: &str,
        new_task_json: &str,
    ) -> Result<(), GitError> {
        self.ack_raw(work_key, old_task_json).await?;
        let redis = self
            .pool
            .get()
            .await
            .map_err(|e| GitError::Internal(format!("redis pool get failed: {}", e)))?;
        let mut conn: RedisConn = redis;
        // LPUSH puts the retry at the head; the consumer pops from the tail,
        // so retried tasks wait behind everything already queued.
        let _: i64 = redis::cmd("LPUSH")
            .arg(queue_key)
            .arg(new_task_json)
            .query_async(&mut conn)
            .await
            .map_err(|e| GitError::Internal(format!("LPUSH retry failed: {}", e)))?;
        slog::warn!(self.logger, "task nack'd and requeued queue={}", queue_key);
        Ok(())
    }

    /// The configured key prefix, e.g. `"{hook}"`.
    pub fn prefix(&self) -> &str {
        &self.prefix
    }
}
/// Manual `Clone`: every field is itself a cheaply clonable handle, so a
/// cloned consumer shares the same pool and logger as the original.
impl Clone for RedisConsumer {
    fn clone(&self) -> Self {
        let Self {
            pool,
            prefix,
            block_timeout_secs,
            logger,
        } = self;
        Self {
            pool: pool.clone(),
            logger: logger.clone(),
            prefix: prefix.clone(),
            block_timeout_secs: *block_timeout_secs,
        }
    }
}

View File

@ -0,0 +1,32 @@
use serde::{Deserialize, Serialize};
pub use config::hook::PoolConfig;
/// A unit of background work, stored in Redis as a JSON string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HookTask {
    /// Unique task id (UUID v4 string), used for log correlation.
    pub id: String,
    /// UUID string of the repository this task targets.
    pub repo_id: String,
    /// Which operation the worker should run for this repo.
    pub task_type: TaskType,
    /// Task-specific data; producers currently enqueue `Null` — TODO confirm
    /// whether any consumer reads this.
    pub payload: serde_json::Value,
    /// UTC timestamp set at enqueue time.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// How many times the task has been retried. `#[serde(default)]` keeps
    /// queue entries written without this field deserializable.
    #[serde(default)]
    pub retry_count: u32,
}
/// The kind of background work a hook task requests.
/// Serialized in snake_case; the `Display` impl must stay in sync with the
/// serde names since both forms are used as Redis queue-name suffixes.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum TaskType {
    /// Full repository metadata sync.
    Sync,
    /// Integrity check only.
    Fsck,
    /// Garbage collection only.
    Gc,
}
impl std::fmt::Display for TaskType {
    /// Lower-case name matching the snake_case serde form; used to build the
    /// Redis queue key for this task type.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            TaskType::Sync => "sync",
            TaskType::Fsck => "fsck",
            TaskType::Gc => "gc",
        };
        f.write_str(name)
    }
}

View File

@ -0,0 +1,388 @@
use crate::error::GitError;
use crate::hook::pool::redis::RedisConsumer;
use crate::hook::pool::types::{HookTask, TaskType};
use crate::hook::sync::HookMetaDataSync;
use db::cache::AppCache;
use db::database::AppDatabase;
use models::EntityTrait;
use slog::Logger;
use std::time::Duration;
use tokio_util::sync::CancellationToken;
/// Single-threaded worker that sequentially consumes tasks from Redis queues.
/// K8s can scale replicas for concurrency — each replica runs one worker.
/// Per-repo Redis locking prevents concurrent workers from processing the same repo.
#[derive(Clone)]
pub struct HookWorker {
    // Database handle; `db.reader()` is used for repo lookups.
    db: AppDatabase,
    // Cache handle, forwarded into HookMetaDataSync.
    cache: AppCache,
    logger: Logger,
    // Queue access: BLMOVE dequeue plus ack/nak settlement.
    consumer: RedisConsumer,
}
impl HookWorker {
    /// Build a worker; call [`HookWorker::run`] to start the consume loop.
    pub fn new(
        db: AppDatabase,
        cache: AppCache,
        logger: Logger,
        consumer: RedisConsumer,
    ) -> Self {
        Self {
            db,
            cache,
            logger,
            consumer,
        }
    }

    /// Run the worker loop. Blocks until cancelled.
    ///
    /// Each cycle sleeps `poll_interval` (doubling as a cancellation
    /// checkpoint), then dequeues at most one task per task type.
    /// `consumer.next()` itself blocks up to the configured BLMOVE timeout.
    pub async fn run(&self, cancel: CancellationToken) {
        slog::info!(self.logger, "hook worker started");
        let task_types = [TaskType::Sync, TaskType::Fsck, TaskType::Gc];
        let poll_interval = Duration::from_millis(500);
        loop {
            tokio::select! {
                _ = cancel.cancelled() => {
                    slog::info!(self.logger, "hook worker shutdown signal received");
                    break;
                }
                _ = tokio::time::sleep(poll_interval) => {}
            }
            for task_type in &task_types {
                let result = self
                    .consumer
                    .next(&task_type.to_string())
                    .await;
                let (task, task_json) = match result {
                    Ok(Some(pair)) => pair,
                    // Queue empty for this type — try the next one.
                    Ok(None) => continue,
                    Err(e) => {
                        // Redis trouble: back off briefly and restart the cycle.
                        slog::warn!(self.logger, "failed to dequeue task: {}", e);
                        tokio::time::sleep(Duration::from_secs(1)).await;
                        break;
                    }
                };
                // Re-derive the key names ack()/nak() operate on; these must
                // match the formats RedisConsumer::next() builds internally.
                let queue_key = format!("{}:{}", self.consumer.prefix(), task_type);
                let work_key = format!("{}:work", queue_key);
                self.process_task(&task, &task_json, &work_key, &queue_key)
                    .await;
            }
        }
        slog::info!(self.logger, "hook worker stopped");
    }

    /// Execute one task and settle it in Redis: ack on success, requeue
    /// untouched on lock conflict, retry-or-discard on failure.
    async fn process_task(
        &self,
        task: &HookTask,
        task_json: &str,
        work_key: &str,
        queue_key: &str,
    ) {
        slog::info!(self.logger, "task started task_id={} task_type={} repo_id={}",
            task.id, task.task_type, task.repo_id);
        let result = match task.task_type {
            TaskType::Sync => self.run_sync(&task.repo_id).await,
            TaskType::Fsck => self.run_fsck(&task.repo_id).await,
            TaskType::Gc => self.run_gc(&task.repo_id).await,
        };
        match result {
            Ok(()) => {
                if let Err(e) = self.consumer.ack(work_key, task_json).await {
                    slog::warn!(self.logger, "failed to ack task: {}", e);
                }
                slog::info!(self.logger, "task completed task_id={}", task.id);
            }
            Err(e) => {
                // GitError::Locked means another worker is processing this repo —
                // requeue without incrementing retry count so it can be picked up later.
                let is_locked = matches!(e, crate::GitError::Locked(_));
                if is_locked {
                    slog::info!(self.logger, "repo locked by another worker, requeueing task_id={}", task.id);
                    if let Err(nak_err) = self.consumer.nak(work_key, queue_key, task_json).await {
                        slog::warn!(self.logger, "failed to requeue locked task: {}", nak_err);
                    }
                } else {
                    slog::warn!(self.logger, "task failed task_id={} task_type={} repo_id={} error={}",
                        task.id, task.task_type, task.repo_id, e);
                    const MAX_RETRIES: u32 = 5;
                    if task.retry_count >= MAX_RETRIES {
                        // Exhausted: ack (remove from work queue) and drop the task.
                        slog::warn!(self.logger, "task exhausted retries, discarding task_id={} retry_count={}",
                            task.id, task.retry_count);
                        let _ = self.consumer.ack(work_key, task_json).await;
                    } else {
                        let mut task = task.clone();
                        task.retry_count += 1;
                        // Serializing HookTask should not fail; if it somehow does,
                        // the original JSON is requeued and the retry-count bump is
                        // lost for that attempt.
                        let retry_json =
                            serde_json::to_string(&task).unwrap_or_else(|_| task_json.to_string());
                        let _ = self
                            .consumer
                            .nak_with_retry(work_key, queue_key, task_json, &retry_json)
                            .await;
                    }
                }
            }
        }
    }

    /// Load the repo, run a full metadata sync, and dispatch push / tag-push
    /// webhooks for every branch or tag tip that changed across the sync.
    async fn run_sync(&self, repo_id: &str) -> Result<(), GitError> {
        let repo_uuid = models::Uuid::parse_str(repo_id)
            .map_err(|_| GitError::Internal("invalid repo_id uuid".into()))?;
        let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
            .one(self.db.reader())
            .await
            .map_err(GitError::from)?
            .ok_or_else(|| GitError::NotFound(format!("repo {} not found", repo_id)))?;
        if !std::path::Path::new(&repo.storage_path).exists() {
            return Err(GitError::NotFound(format!(
                "storage path does not exist: {}",
                repo.storage_path
            )));
        }
        // Capture before tips
        // (run on a blocking thread; presumably because git2 objects are not
        //  Send — TODO confirm against HookMetaDataSync internals)
        let before_tips = tokio::task::spawn_blocking({
            let db = self.db.clone();
            let cache = self.cache.clone();
            let logger = self.logger.clone();
            let repo = repo.clone();
            move || {
                let sync = HookMetaDataSync::new(db, cache, repo, logger)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                Ok::<_, GitError>((sync.list_branch_tips(), sync.list_tag_tips()))
            }
        })
        .await
        .map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
        .map_err(GitError::from)?;
        // Run sync
        // NOTE(review): Handle::current().block_on inside spawn_blocking panics
        // on a current-thread tokio runtime — confirm the worker always runs on
        // a multi-thread runtime.
        tokio::task::spawn_blocking({
            let db = self.db.clone();
            let cache = self.cache.clone();
            let logger = self.logger.clone();
            let repo = repo.clone();
            move || {
                let result = tokio::runtime::Handle::current().block_on(async {
                    let sync = HookMetaDataSync::new(db, cache, repo, logger)?;
                    sync.sync().await
                });
                match result {
                    Ok(()) => Ok::<(), GitError>(()),
                    Err(e) => Err(GitError::Internal(e.to_string())),
                }
            }
        })
        .await
        .map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
        .map_err(GitError::from)?;
        // Capture after tips
        let after_tips = tokio::task::spawn_blocking({
            let db = self.db.clone();
            let cache = self.cache.clone();
            let logger = self.logger.clone();
            let repo = repo.clone();
            move || {
                let sync = HookMetaDataSync::new(db, cache, repo, logger)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                Ok::<_, GitError>((sync.list_branch_tips(), sync.list_tag_tips()))
            }
        })
        .await
        .map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
        .map_err(GitError::from)?;
        let (before_branch_tips, before_tag_tips) = before_tips;
        let (after_branch_tips, after_tag_tips) = after_tips;
        let project = repo.project;
        // Dispatch branch push webhooks
        for (branch, after_oid) in after_branch_tips {
            let before_oid = before_branch_tips
                .iter()
                .find(|(n, _)| n == &branch)
                .map(|(_, o)| o.as_str());
            // A branch absent from the before snapshot is new → always dispatch.
            let changed = before_oid.map(|o| o != after_oid.as_str()).unwrap_or(true);
            if changed {
                // "0" stands in for "no previous tip".
                // NOTE(review): webhook consumers may expect the all-zero
                // 40-char OID here instead — confirm the payload contract.
                let before_oid = before_oid.map_or("0", |v| v).to_string();
                let branch_name = branch.clone();
                let db = self.db.clone();
                let logger = self.logger.clone();
                let repo_id_str = repo.id.to_string();
                let repo_name = repo.repo_name.clone();
                let default_branch = repo.default_branch.clone();
                // Project namespace is re-fetched per changed ref; lookup errors
                // degrade to an empty namespace rather than failing the sync.
                let ns = models::projects::Project::find_by_id(project)
                    .one(self.db.reader())
                    .await
                    .ok()
                    .flatten()
                    .map(|p| p.name)
                    .unwrap_or_default();
                // Fire-and-forget: webhook delivery must not block or fail the sync.
                tokio::spawn(async move {
                    crate::hook::webhook_dispatch::dispatch_repo_webhooks(
                        &db,
                        &reqwest::Client::new(),
                        &logger,
                        &repo_id_str,
                        &ns,
                        &repo_name,
                        &default_branch,
                        "",
                        "",
                        crate::hook::webhook_dispatch::WebhookEventKind::Push {
                            r#ref: format!("refs/heads/{}", branch_name),
                            before: before_oid,
                            after: after_oid,
                            commits: vec![],
                        },
                    )
                    .await;
                });
            }
        }
        // Dispatch tag push webhooks
        for (tag, after_oid) in after_tag_tips {
            let before_oid = before_tag_tips
                .iter()
                .find(|(n, _)| n == &tag)
                .map(|(_, o)| o.as_str());
            let is_new = before_oid.is_none();
            let was_updated = before_oid.map(|o| o != after_oid.as_str()).unwrap_or(false);
            if is_new || was_updated {
                let before_oid = before_oid.map_or("0", |v| v).to_string();
                let tag_name = tag.clone();
                let db = self.db.clone();
                let logger = self.logger.clone();
                let repo_id_str = repo.id.to_string();
                let repo_name = repo.repo_name.clone();
                let default_branch = repo.default_branch.clone();
                let ns = models::projects::Project::find_by_id(project)
                    .one(self.db.reader())
                    .await
                    .ok()
                    .flatten()
                    .map(|p| p.name)
                    .unwrap_or_default();
                tokio::spawn(async move {
                    crate::hook::webhook_dispatch::dispatch_repo_webhooks(
                        &db,
                        &reqwest::Client::new(),
                        &logger,
                        &repo_id_str,
                        &ns,
                        &repo_name,
                        &default_branch,
                        "",
                        "",
                        crate::hook::webhook_dispatch::WebhookEventKind::TagPush {
                            r#ref: format!("refs/tags/{}", tag_name),
                            before: before_oid,
                            after: after_oid,
                        },
                    )
                    .await;
                });
            }
        }
        Ok(())
    }

    /// Load the repo and run an integrity check only (no full sync).
    async fn run_fsck(&self, repo_id: &str) -> Result<(), GitError> {
        let repo_uuid = models::Uuid::parse_str(repo_id)
            .map_err(|_| GitError::Internal("invalid repo_id uuid".into()))?;
        let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
            .one(self.db.reader())
            .await
            .map_err(GitError::from)?
            .ok_or_else(|| GitError::NotFound(format!("repo {} not found", repo_id)))?;
        if !std::path::Path::new(&repo.storage_path).exists() {
            return Err(GitError::NotFound(format!(
                "storage path does not exist: {}",
                repo.storage_path
            )));
        }
        let db = self.db.clone();
        let cache = self.cache.clone();
        let logger = self.logger.clone();
        tokio::task::spawn_blocking({
            let repo = repo.clone();
            move || {
                // Every inner error (including Locked) is flattened to
                // GitError::Internal here — the Locked-requeue path in
                // process_task relies on sync-level code NOT being routed
                // through this wrapper.
                let result = tokio::runtime::Handle::current().block_on(async {
                    let sync = HookMetaDataSync::new(db, cache, repo, logger)?;
                    sync.fsck_only().await
                });
                match result {
                    Ok(()) => Ok::<(), GitError>(()),
                    Err(e) => Err(GitError::Internal(e.to_string())),
                }
            }
        })
        .await
        .map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
        .map_err(GitError::from)?;
        Ok(())
    }

    /// Load the repo and run garbage collection only (no full sync).
    async fn run_gc(&self, repo_id: &str) -> Result<(), GitError> {
        let repo_uuid = models::Uuid::parse_str(repo_id)
            .map_err(|_| GitError::Internal("invalid repo_id uuid".into()))?;
        let repo = models::repos::repo::Entity::find_by_id(repo_uuid)
            .one(self.db.reader())
            .await
            .map_err(GitError::from)?
            .ok_or_else(|| GitError::NotFound(format!("repo {} not found", repo_id)))?;
        if !std::path::Path::new(&repo.storage_path).exists() {
            return Err(GitError::NotFound(format!(
                "storage path does not exist: {}",
                repo.storage_path
            )));
        }
        let db = self.db.clone();
        let cache = self.cache.clone();
        let logger = self.logger.clone();
        tokio::task::spawn_blocking({
            let repo = repo.clone();
            move || {
                let result = tokio::runtime::Handle::current().block_on(async {
                    let sync = HookMetaDataSync::new(db, cache, repo, logger)?;
                    sync.gc_only().await
                });
                match result {
                    Ok(()) => Ok::<(), GitError>(()),
                    Err(e) => Err(GitError::Internal(e.to_string())),
                }
            }
        })
        .await
        .map_err(|e| GitError::Internal(format!("spawn_blocking join error: {}", e)))?
        .map_err(GitError::from)?;
        Ok(())
    }
}

View File

@ -0,0 +1,66 @@
use crate::GitError;
use crate::hook::sync::HookMetaDataSync;
impl HookMetaDataSync {
    /// Lock TTL so a crashed holder cannot wedge the repo forever.
    /// NOTE(review): a sync/fsck/gc run longer than 60s outlives its own lock
    /// and a second worker can acquire it — no refresh/heartbeat is
    /// implemented; confirm the TTL covers worst-case run times.
    const LOCK_TTL_SECS: u64 = 60;

    /// Try to acquire an exclusive lock for this repo.
    /// Returns the lock value if acquired, which must be passed to `release_lock`.
    pub async fn acquire_lock(&self) -> Result<String, GitError> {
        let lock_key = format!("git:repo:lock:{}", self.repo.id);
        // Unique owner token (uuid + pid) so release can verify ownership.
        let lock_value = format!("{}:{}", uuid::Uuid::new_v4(), std::process::id());
        let mut conn = self
            .cache
            .conn()
            .await
            .map_err(|e| GitError::IoError(format!("failed to get redis connection: {}", e)))?;
        // SET key value NX EX ttl — succeeds only if the key does not already
        // exist; `false` means another process holds the lock.
        let result: bool = redis::cmd("SET")
            .arg(&lock_key)
            .arg(&lock_value)
            .arg("NX")
            .arg("EX")
            .arg(Self::LOCK_TTL_SECS)
            .query_async(&mut conn)
            .await
            .map_err(|e| GitError::IoError(format!("failed to acquire lock: {}", e)))?;
        if result {
            Ok(lock_value)
        } else {
            // Surfaced as GitError::Locked so the worker can requeue the task
            // without incrementing its retry count.
            Err(GitError::Locked(format!(
                "repository {} is locked by another process",
                self.repo.id
            )))
        }
    }

    /// Release the lock, but only if we still own it (value matches).
    pub async fn release_lock(&self, lock_value: &str) -> Result<(), GitError> {
        let lock_key = format!("git:repo:lock:{}", self.repo.id);
        let mut conn = self
            .cache
            .conn()
            .await
            .map_err(|e| GitError::IoError(format!("failed to get redis connection: {}", e)))?;
        // Compare-and-delete in a Lua script so we never delete a lock that
        // expired and was subsequently re-acquired by another process.
        let script = r#"
        if redis.call("get", KEYS[1]) == ARGV[1] then
            return redis.call("del", KEYS[1])
        else
            return 0
        end
        "#;
        let _: i32 = redis::Script::new(script)
            .key(&lock_key)
            .arg(lock_value)
            .invoke_async(&mut conn)
            .await
            .map_err(|e| GitError::IoError(format!("failed to release lock: {}", e)))?;
        Ok(())
    }
}

View File

@ -3,6 +3,7 @@ pub mod commit;
pub mod fsck; pub mod fsck;
pub mod gc; pub mod gc;
pub mod lfs; pub mod lfs;
pub mod lock;
pub mod tag; pub mod tag;
use db::cache::AppCache; use db::cache::AppCache;
@ -158,73 +159,56 @@ impl HookMetaDataSync {
} }
/// Full sync: refs → commits → tags → LFS → fsck → gc → skills. /// Full sync: refs → commits → tags → LFS → fsck → gc → skills.
/// No distributed lock — K8s StatefulSet scheduling guarantees exclusive access. /// Acquires a per-repo Redis distributed lock to prevent concurrent workers
/// from processing the same repository simultaneously.
pub async fn sync(&self) -> Result<(), crate::GitError> { pub async fn sync(&self) -> Result<(), crate::GitError> {
// All git2 operations must run on a blocking thread since git2 types are not Send. let lock_value = self.acquire_lock().await?;
let db = self.db.clone();
let cache = self.cache.clone();
let repo = self.repo.clone();
let logger = self.logger.clone();
let res = tokio::task::spawn_blocking(move || { let res = self.sync_full().await;
tokio::runtime::Handle::current().block_on(async move {
let sync = Self::new(db, cache, repo, logger)?; if let Err(ref e) = res {
let out = sync.sync_full().await; slog::error!(self.logger, "sync failed: {}", e);
if let Err(ref e) = out {
slog::error!(sync.logger, "sync failed: {}", e);
} }
out
})
})
.await
.map_err(|e| crate::GitError::Internal(format!("spawn_blocking join error: {}", e)))??;
Ok(res) if let Err(release_err) = self.release_lock(&lock_value).await {
slog::error!(self.logger, "failed to release lock: {}", release_err);
}
res
} }
/// Run fsck only (refs snapshot + git fsck + rollback on corruption). /// Run fsck only (refs snapshot + git fsck + rollback on corruption).
/// Acquires a per-repo Redis lock.
pub async fn fsck_only(&self) -> Result<(), crate::GitError> { pub async fn fsck_only(&self) -> Result<(), crate::GitError> {
let db = self.db.clone(); let lock_value = self.acquire_lock().await?;
let cache = self.cache.clone();
let repo = self.repo.clone();
let logger = self.logger.clone();
tokio::task::spawn_blocking(move || { let res = async {
tokio::runtime::Handle::current().block_on(async move { let mut txn = self
let sync = Self::new(db, cache, repo, logger)?; .db
let mut txn = sync.db.begin().await.map_err(|e| { .begin()
crate::GitError::IoError(format!("failed to begin transaction: {}", e)) .await
})?; .map_err(|e| crate::GitError::IoError(format!("failed to begin transaction: {}", e)))?;
sync.run_fsck_and_rollback_if_corrupt(&mut txn).await?; self.run_fsck_and_rollback_if_corrupt(&mut txn)
.await?;
txn.commit().await.map_err(|e| { txn.commit().await.map_err(|e| {
crate::GitError::IoError(format!("failed to commit transaction: {}", e)) crate::GitError::IoError(format!("failed to commit transaction: {}", e))
})?; })?;
Ok::<(), crate::GitError>(()) Ok::<(), crate::GitError>(())
}) }
}) .await;
.await
.map_err(|e| crate::GitError::Internal(format!("spawn_blocking join error: {}", e)))??;
Ok(()) let _ = self.release_lock(&lock_value).await;
res
} }
/// Run gc only. /// Run gc only. Acquires a per-repo Redis lock.
pub async fn gc_only(&self) -> Result<(), crate::GitError> { pub async fn gc_only(&self) -> Result<(), crate::GitError> {
let db = self.db.clone(); let lock_value = self.acquire_lock().await?;
let cache = self.cache.clone();
let repo = self.repo.clone();
let logger = self.logger.clone();
tokio::task::spawn_blocking(move || { let res = self.run_gc().await;
tokio::runtime::Handle::current().block_on(async move {
let sync = Self::new(db, cache, repo, logger)?;
sync.run_gc().await
})
})
.await
.map_err(|e| crate::GitError::Internal(format!("spawn_blocking join error: {}", e)))??;
Ok(()) let _ = self.release_lock(&lock_value).await;
res
} }
/// Full sync pipeline inside a single DB transaction. /// Full sync pipeline inside a single DB transaction.

View File

@ -72,16 +72,19 @@ pub async fn run_http(config: AppConfig, logger: Logger) -> anyhow::Result<()> {
let app_cache = app_cache?; let app_cache = app_cache?;
let redis_pool = app_cache.redis_pool().clone(); let redis_pool = app_cache.redis_pool().clone();
let http = Arc::new(reqwest::Client::new()); let http_client = Arc::new(reqwest::Client::new());
let hook = HookService::new( let hook = HookService::new(
db.clone(), db.clone(),
app_cache.clone(), app_cache.clone(),
redis_pool.clone(), redis_pool.clone(),
logger.clone(), logger.clone(),
config.clone(), config.clone(),
http, http_client,
); );
let sync = crate::ssh::ReceiveSyncService::new(hook); let _worker_cancel = hook.start_worker();
slog::info!(logger, "hook worker started");
let sync = crate::ssh::ReceiveSyncService::new(redis_pool.clone(), logger.clone());
let rate_limiter = Arc::new(rate_limit::RateLimiter::new( let rate_limiter = Arc::new(rate_limit::RateLimiter::new(
rate_limit::RateLimitConfig::default(), rate_limit::RateLimitConfig::default(),

View File

@ -36,6 +36,9 @@ pub use diff::types::{
}; };
pub use domain::GitDomain; pub use domain::GitDomain;
pub use error::{GitError, GitResult}; pub use error::{GitError, GitResult};
pub use hook::pool::types::{HookTask, TaskType};
pub use hook::pool::PoolConfig;
pub use hook::pool::HookWorker;
pub use hook::sync::HookMetaDataSync; pub use hook::sync::HookMetaDataSync;
pub use lfs::types::{LfsConfig, LfsEntry, LfsOid, LfsPointer}; pub use lfs::types::{LfsConfig, LfsEntry, LfsOid, LfsPointer};
pub use merge::types::{MergeAnalysisResult, MergeOptions, MergePreferenceResult, MergeheadInfo}; pub use merge::types::{MergeAnalysisResult, MergeOptions, MergePreferenceResult, MergeheadInfo};

View File

@ -1,5 +1,6 @@
use crate::error::GitError; use crate::error::GitError;
use crate::hook::HookService; use crate::hook::HookService;
use crate::hook::pool::types::{HookTask, TaskType};
use anyhow::Context; use anyhow::Context;
use base64::Engine; use base64::Engine;
use config::AppConfig; use config::AppConfig;
@ -137,22 +138,12 @@ impl SSHHandle {
"SSH server configured with methods: {:?}", config.methods "SSH server configured with methods: {:?}", config.methods
); );
let token_service = SshTokenService::new(self.db.clone()); let token_service = SshTokenService::new(self.db.clone());
let http = Arc::new(reqwest::Client::new());
let hook = crate::hook::HookService::new(
self.db.clone(),
self.cache.clone(),
self.redis_pool.clone(),
self.logger.clone(),
self.app.clone(),
http,
);
let mut server = server::SSHServer::new( let mut server = server::SSHServer::new(
self.db.clone(), self.db.clone(),
self.cache.clone(), self.cache.clone(),
self.redis_pool.clone(), self.redis_pool.clone(),
self.logger.clone(), self.logger.clone(),
token_service, token_service,
hook,
); );
// Start the rate limiter cleanup background task so the HashMap // Start the rate limiter cleanup background task so the HashMap
@ -177,33 +168,63 @@ impl SSHHandle {
} }
} }
/// Direct sync service — calls HookService::sync_repo inline. /// Enqueues a sync task to the Redis-backed hook queue.
/// K8s StatefulSet HA scheduling ensures exclusive access per repo shard. /// The background worker picks it up and processes it with per-repo locking.
#[derive(Clone)] #[derive(Clone)]
pub struct ReceiveSyncService { pub struct ReceiveSyncService {
hook: HookService, pool: deadpool_redis::cluster::Pool,
logger: Logger,
redis_prefix: String,
} }
impl ReceiveSyncService { impl ReceiveSyncService {
pub fn new(hook: HookService) -> Self { pub fn new(pool: deadpool_redis::cluster::Pool, logger: Logger) -> Self {
Self { hook } Self {
pool,
logger,
redis_prefix: "{hook}".to_string(),
}
} }
/// Execute a full repo sync synchronously. /// Enqueue a sync task. Fire-and-forget — logs errors but does not block.
/// Returns Ok on success, Err on failure. pub async fn send(&self, task: RepoReceiveSyncTask) {
pub async fn send(&self, task: RepoReceiveSyncTask) -> Result<(), crate::GitError> { let hook_task = HookTask {
let repo_id = task.repo_uid.to_string(); id: uuid::Uuid::new_v4().to_string(),
slog::info!(self.hook.logger, "starting sync repo_id={}", repo_id); repo_id: task.repo_uid.to_string(),
let res = self.hook.sync_repo(&repo_id).await; task_type: TaskType::Sync,
match &res { payload: serde_json::Value::Null,
Ok(()) => { created_at: chrono::Utc::now(),
slog::info!(self.hook.logger, "sync completed repo_id={}", repo_id); retry_count: 0,
} };
let task_json = match serde_json::to_string(&hook_task) {
Ok(j) => j,
Err(e) => { Err(e) => {
slog::error!(self.hook.logger, "sync failed repo_id={} error={}", repo_id, e); error!(self.logger, "failed to serialize hook task: {}", e);
return;
} }
};
let queue_key = format!("{}:sync", self.redis_prefix);
let redis = match self.pool.get().await {
Ok(c) => c,
Err(e) => {
error!(self.logger, "failed to get Redis connection: {}", e);
return;
}
};
let mut conn: deadpool_redis::cluster::Connection = redis;
if let Err(e) = redis::cmd("LPUSH")
.arg(&queue_key)
.arg(&task_json)
.query_async::<()>(&mut conn)
.await
{
error!(self.logger, "failed to enqueue sync task repo_id={} error={}",
task.repo_uid, e);
} }
res
} }
} }
@ -267,6 +288,19 @@ pub async fn run_ssh(config: AppConfig, logger: Logger) -> anyhow::Result<()> {
let db = AppDatabase::init(&config).await?; let db = AppDatabase::init(&config).await?;
let cache = AppCache::init(&config).await?; let cache = AppCache::init(&config).await?;
let redis_pool = cache.redis_pool().clone(); let redis_pool = cache.redis_pool().clone();
// Start the hook worker (Redis queue consumer)
let hook = crate::hook::HookService::new(
db.clone(),
cache.clone(),
redis_pool.clone(),
logger.clone(),
config.clone(),
Arc::new(reqwest::Client::new()),
);
let _worker_cancel = hook.start_worker();
slog::info!(logger, "hook worker started");
SSHHandle::new(db, config.clone(), cache, redis_pool, logger) SSHHandle::new(db, config.clone(), cache, redis_pool, logger)
.run_ssh() .run_ssh()
.await?; .await?;

View File

@ -1,4 +1,3 @@
use crate::hook::HookService;
use crate::ssh::ReceiveSyncService; use crate::ssh::ReceiveSyncService;
use crate::ssh::SshTokenService; use crate::ssh::SshTokenService;
use crate::ssh::handle::SSHandle; use crate::ssh::handle::SSHandle;
@ -16,7 +15,6 @@ pub struct SSHServer {
pub redis_pool: RedisPool, pub redis_pool: RedisPool,
pub logger: Logger, pub logger: Logger,
pub token_service: SshTokenService, pub token_service: SshTokenService,
pub hook: HookService,
} }
impl SSHServer { impl SSHServer {
@ -26,7 +24,6 @@ impl SSHServer {
redis_pool: RedisPool, redis_pool: RedisPool,
logger: Logger, logger: Logger,
token_service: SshTokenService, token_service: SshTokenService,
hook: HookService,
) -> Self { ) -> Self {
SSHServer { SSHServer {
db, db,
@ -34,7 +31,6 @@ impl SSHServer {
redis_pool, redis_pool,
logger, logger,
token_service, token_service,
hook,
} }
} }
} }
@ -52,7 +48,7 @@ impl russh::server::Server for SSHServer {
} else { } else {
info!(self.logger, "New SSH connection from unknown address"); info!(self.logger, "New SSH connection from unknown address");
} }
let sync_service = ReceiveSyncService::new(self.hook.clone()); let sync_service = ReceiveSyncService::new(self.redis_pool.clone(), self.logger.clone());
SSHandle::new( SSHandle::new(
self.db.clone(), self.db.clone(),
self.cache.clone(), self.cache.clone(),