From 02b7a5bedaa79872d6ddf35b6d519e40c1255bbf Mon Sep 17 00:00:00 2001 From: ZhenYi <434836402@qq.com> Date: Sun, 26 Apr 2026 00:16:21 +0800 Subject: [PATCH] feat(gitserver): add /robots.txt to disallow all crawlers - Returns Disallow: / for all user-agents - Points crawlers to main site sitemap via APP_GIT_HTTP_DOMAIN --- docs/metrics.md | 6 +++++- libs/git/http/mod.rs | 23 ++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/metrics.md b/docs/metrics.md index 7255652..6d79101 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -295,4 +295,8 @@ readinessProbe: **域名**:`robots.txt` 中的 Sitemap URL 动态读取 `APP_DOMAIN_URL` 环境变量并强制使用 `https://` 前缀。 -**禁止爬取**:`/sidemap/` 子路径在 `robots.txt` 中未声明,由主站统一控制,仅 `sitemapindex` 对外可见。 +**禁止爬取**: +- 主站 `app` (`/robots.txt`) — 禁止 `/api/`、`/health`、`/metrics`、`/ws/` 等,同时声明 `Sitemap: {APP_DOMAIN_URL}/sitemap.xml` +- Gitserver (`/robots.txt`) — `Disallow: /` 禁止所有路由,并指向主站 sitemap + +**缓存策略**:用户/项目/仓库 sitemap 数据通过 Redis 缓存,TTL 8 小时,访问不续期,过期自动重新生成。缓存在 key `sidemap:{users,projects,repos}` 下。 diff --git a/libs/git/http/mod.rs b/libs/git/http/mod.rs index c9b4967..56607e8 100644 --- a/libs/git/http/mod.rs +++ b/libs/git/http/mod.rs @@ -20,6 +20,25 @@ pub struct HttpAppState { pub cache: AppCache, pub sync: crate::ssh::ReceiveSyncService, pub rate_limiter: Arc, + pub config: AppConfig, +} + +async fn robots(state: web::Data) -> HttpResponse { + let sitemap_url = state + .config + .git_http_domain() + .map(|d| format!("{}/sitemap.xml", d.trim_end_matches('/'))) + .unwrap_or_default(); + + let body = if sitemap_url.is_empty() { + "User-agent: *\nDisallow: /\n".to_string() + } else { + format!("User-agent: *\nDisallow: /\n\nSitemap: {sitemap_url}\n") + }; + + HttpResponse::Ok() + .content_type("text/plain; charset=utf-8") + .body(body) } async fn health(state: web::Data) -> HttpResponse { @@ -49,7 +68,8 @@ async fn health(state: web::Data) -> HttpResponse { } pub fn git_http_cfg(cfg: &mut web::ServiceConfig) { - cfg.route("/health", web::get().to(health)) + cfg.route("/robots.txt", web::get().to(robots)) + .route("/health", web::get().to(health)) .route( "/{namespace}/{repo_name}.git/info/refs", web::get().to(routes::info_refs), @@ -119,6 +139,7 @@ pub async fn run_http(config: AppConfig) -> anyhow::Result<()> { cache: app_cache.clone(), sync, rate_limiter, + config: config.clone(), }; tracing::info!("Starting git HTTP server on 0.0.0.0:8021");