feat(gitserver): add /robots.txt to disallow all crawlers
- Returns `Disallow: /` for all user-agents
- Points crawlers to main site sitemap via APP_GIT_HTTP_DOMAIN
This commit is contained in:
parent
7eb9c5a7fb
commit
02b7a5beda
@ -295,4 +295,8 @@ readinessProbe:
|
|||||||
|
|
||||||
**域名**:`robots.txt` 中的 Sitemap URL 动态读取 `APP_DOMAIN_URL` 环境变量并强制使用 `https://` 前缀。
|
**域名**:`robots.txt` 中的 Sitemap URL 动态读取 `APP_DOMAIN_URL` 环境变量并强制使用 `https://` 前缀。
|
||||||
|
|
||||||
**禁止爬取**:`/sidemap/` 子路径在 `robots.txt` 中未声明,由主站统一控制,仅 `sitemapindex` 对外可见。
|
**禁止爬取**:
|
||||||
|
- 主站 `app` (`/robots.txt`) — 禁止 `/api/`、`/health`、`/metrics`、`/ws/` 等,同时声明 `Sitemap: {APP_DOMAIN_URL}/sitemap.xml`
|
||||||
|
- Gitserver (`/robots.txt`) — `Disallow: /` 禁止所有路由,并指向主站 sitemap
|
||||||
|
|
||||||
|
**缓存策略**:用户/项目/仓库 sitemap 数据通过 Redis 缓存,TTL 8 小时,访问不续期,过期自动重新生成。缓存在 key `sidemap:{users,projects,repos}` 下。
|
||||||
|
|||||||
@ -20,6 +20,25 @@ pub struct HttpAppState {
|
|||||||
pub cache: AppCache,
|
pub cache: AppCache,
|
||||||
pub sync: crate::ssh::ReceiveSyncService,
|
pub sync: crate::ssh::ReceiveSyncService,
|
||||||
pub rate_limiter: Arc<rate_limit::RateLimiter>,
|
pub rate_limiter: Arc<rate_limit::RateLimiter>,
|
||||||
|
pub config: AppConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn robots(state: web::Data<HttpAppState>) -> HttpResponse {
|
||||||
|
let sitemap_url = state
|
||||||
|
.config
|
||||||
|
.git_http_domain()
|
||||||
|
.map(|d| format!("{}/sitemap.xml", d.trim_end_matches('/')))
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let body = if sitemap_url.is_empty() {
|
||||||
|
"User-agent: *\nDisallow: /\n".to_string()
|
||||||
|
} else {
|
||||||
|
format!("User-agent: *\nDisallow: /\n\nSitemap: {sitemap_url}\n")
|
||||||
|
};
|
||||||
|
|
||||||
|
HttpResponse::Ok()
|
||||||
|
.content_type("text/plain; charset=utf-8")
|
||||||
|
.body(body)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
||||||
@ -49,7 +68,8 @@ async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn git_http_cfg(cfg: &mut web::ServiceConfig) {
|
pub fn git_http_cfg(cfg: &mut web::ServiceConfig) {
|
||||||
cfg.route("/health", web::get().to(health))
|
cfg.route("/robots.txt", web::get().to(robots))
|
||||||
|
.route("/health", web::get().to(health))
|
||||||
.route(
|
.route(
|
||||||
"/{namespace}/{repo_name}.git/info/refs",
|
"/{namespace}/{repo_name}.git/info/refs",
|
||||||
web::get().to(routes::info_refs),
|
web::get().to(routes::info_refs),
|
||||||
@ -119,6 +139,7 @@ pub async fn run_http(config: AppConfig) -> anyhow::Result<()> {
|
|||||||
cache: app_cache.clone(),
|
cache: app_cache.clone(),
|
||||||
sync,
|
sync,
|
||||||
rate_limiter,
|
rate_limiter,
|
||||||
|
config: config.clone(),
|
||||||
};
|
};
|
||||||
|
|
||||||
tracing::info!("Starting git HTTP server on 0.0.0.0:8021");
|
tracing::info!("Starting git HTTP server on 0.0.0.0:8021");
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user