From d593354ba9a425cad83ac7fa11b82a523a992d7e Mon Sep 17 00:00:00 2001 From: ZhenYi <434836402@qq.com> Date: Sun, 26 Apr 2026 00:03:18 +0800 Subject: [PATCH] feat: add sitemap index with static/users/projects/repos sub-sitemaps - Main sitemap index at /sitemap.xml referencing 4 sub-sitemaps - /sidemap/static: fixed routes (homepage, auth, marketing pages) - /sidemap/users: public user profiles sorted alphabetically - /sidemap/projects: public projects sorted alphabetically - /sidemap/repos: public repos sorted alphabetically - Redis cache with 8h TTL (no refresh on access), key: sidemap:{type} - robots.txt Sitemap URL uses main_domain() with https:// forced - All sitemap loc entries use https:// base URL --- Cargo.lock | 1 + apps/app/src/main.rs | 11 +- libs/api/Cargo.toml | 1 + libs/api/robots.rs | 31 +++- libs/api/sidemap.rs | 328 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 364 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9840d0f..fe2d102 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -647,6 +647,7 @@ dependencies = [ "mime_guess2", "models", "queue", + "redis", "room", "rust_decimal", "sea-orm", diff --git a/apps/app/src/main.rs b/apps/app/src/main.rs index 0ff0955..723bb5d 100644 --- a/apps/app/src/main.rs +++ b/apps/app/src/main.rs @@ -13,7 +13,7 @@ use observability::{ use sea_orm::ConnectionTrait; use service::AppService; use session::config::{PersistentSession, SessionLifecycle, TtlExtensionPolicy}; -use api::robots; +use api::{robots, sidemap}; use session::storage::RedisClusterSessionStore; use session::SessionMiddleware; use std::task::{Context, Poll}; @@ -220,6 +220,15 @@ async fn main() -> anyhow::Result<()> { .app_data(http_snapshot_data.clone()) .app_data(prometheus_handle_data.clone()) .route("/robots.txt", web::get().to(robots::robots)) + .route("/sitemap.xml", web::get().to(sidemap::sitemap)) + .service( + web::scope("/sidemap") + .route("", web::get().to(sidemap::sitemap)) + .route("/static", web::get().to(sidemap::sitemap_static)) + .route("/users", web::get().to(sidemap::sitemap_users)) + .route("/projects", web::get().to(sidemap::sitemap_projects)) + .route("/repos", web::get().to(sidemap::sitemap_repos)), + ) .route("/health", web::get().to(health_check)) .route("/metrics", web::get().to(prometheus_handler)) .configure(api::route::init_routes) diff --git a/libs/api/Cargo.toml b/libs/api/Cargo.toml index 83214f1..9bf67cc 100644 --- a/libs/api/Cargo.toml +++ b/libs/api/Cargo.toml @@ -49,5 +49,6 @@ mime_guess2 = { workspace = true, features = ["phf-map"] } sea-orm = "2.0.0-rc.37" rust_decimal = "1.40.0" actix-multipart = { workspace = true, features = ["tempfile"] } +redis = { workspace = true } [lints] workspace = true diff --git a/libs/api/robots.rs b/libs/api/robots.rs index 2eaef3e..e0182d4 100644 --- a/libs/api/robots.rs +++ b/libs/api/robots.rs @@ -1,11 +1,22 @@ -use actix_web::HttpResponse; +use actix_web::{web, HttpResponse}; +use service::AppService; /// Serves robots.txt, blocking all sensitive paths from crawlers. -pub async fn robots() -> HttpResponse { - HttpResponse::Ok() - .content_type("text/plain; charset=utf-8") - .body( - r#"User-agent: * +pub async fn robots(service: web::Data) -> HttpResponse { + let raw = service + .config + .main_domain() + .unwrap_or_else(|_| "https://gitdata.ai".to_string()); + let sitemap_base = if raw.starts_with("https://") { + raw.trim_end_matches('/').to_string() + } else if raw.starts_with("http://") { + raw.replacen("http://", "https://", 1) + } else { + format!("https://{raw}") + }; + + let body = format!( + r#"User-agent: * Disallow: /api/ Disallow: /health Disallow: /metrics @@ -15,6 +26,12 @@ Disallow: /blob/ Disallow: /media/ Disallow: /static/ Disallow: /assets/ + +Sitemap: {sitemap_base}/sitemap.xml "#, - ) + ); + + HttpResponse::Ok() + .content_type("text/plain; charset=utf-8") + .body(body) } diff --git a/libs/api/sidemap.rs b/libs/api/sidemap.rs index e69de29..bd0679d 100644 --- a/libs/api/sidemap.rs +++ b/libs/api/sidemap.rs @@ -0,0 +1,328 @@ +use actix_web::{web, HttpResponse}; +use db::cache::AppCache; +use models::projects::project::{Column as PCol, Entity as PEntity}; +use models::repos::repo::{Column as RCol, Entity as REntity}; +use models::users::user::{Column as UCol, Entity as UEntity}; +use sea_orm::*; +use service::AppService; + +const CACHE_KEY_PREFIX: &str = "sidemap"; +const CACHE_TTL_SECS: u64 = 8 * 3600; // 8 hours, no refresh + +/// Returns the base URL, forcing https:// prefix for public sitemap crawlers. +fn public_base(config: &config::AppConfig) -> String { + let fallback = "https://gitdata.ai".to_string(); + let base = match config.main_domain() { + Ok(b) => b.trim_end_matches('/').to_string(), + Err(_) => fallback, + }; + if base.starts_with("https://") { + base + } else if base.starts_with("http://") { + base.replacen("http://", "https://", 1) + } else { + format!("https://{base}") + } +} + +// ── Handlers ────────────────────────────────────────────────────────────────── + +/// Main sitemap index referencing all sub-sitemaps. +pub async fn sitemap(service: web::Data) -> HttpResponse { + let base = public_base(&service.config); + + let xml = format!( + r#" + + + {base}/sidemap/static + + + {base}/sidemap/users + + + {base}/sidemap/projects + + + {base}/sidemap/repos + +"# + ); + + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(xml) +} + +/// Static routes (no DB, no cache). +pub async fn sitemap_static(service: web::Data) -> HttpResponse { + let base = public_base(&service.config); + + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(build_static_xml(&base)) +} + +/// User profiles sitemap. +pub async fn sitemap_users(service: web::Data) -> HttpResponse { + let base = public_base(&service.config); + + let xml = cached_or_build(&service.cache, "users", || async { + let db = service.db.reader(); + let users: Vec<(String, String)> = UEntity::find() + .filter(UCol::Username.ne("")) + .order_by_asc(UCol::Username) + .all(db) + .await + .unwrap_or_default() + .into_iter() + .map(|u| (u.username, u.updated_at.to_rfc3339())) + .collect(); + Ok(build_users_xml(&base, &users)) + }) + .await + .unwrap_or_else(|_| build_users_xml(&base, &[])); + + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(xml) +} + +/// Public projects sitemap. +pub async fn sitemap_projects(service: web::Data) -> HttpResponse { + let base = public_base(&service.config); + + let xml = cached_or_build(&service.cache, "projects", || async { + let db = service.db.reader(); + let projects: Vec<(String, String, String)> = PEntity::find() + .filter(PCol::IsPublic.eq(true)) + .order_by_asc(PCol::Name) + .all(db) + .await + .unwrap_or_default() + .into_iter() + .map(|p| (p.name, p.id.to_string(), p.updated_at.to_rfc3339())) + .collect(); + Ok(build_projects_xml(&base, &projects)) + }) + .await + .unwrap_or_else(|_| build_projects_xml(&base, &[])); + + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(xml) +} + +/// Public repos sitemap. +pub async fn sitemap_repos(service: web::Data) -> HttpResponse { + let base = public_base(&service.config); + + let xml = cached_or_build(&service.cache, "repos", || async { + let db = service.db.reader(); + + let project_map: std::collections::HashMap = PEntity::find() + .filter(PCol::IsPublic.eq(true)) + .all(db) + .await + .unwrap_or_default() + .into_iter() + .map(|p| (p.id.to_string(), p.name)) + .collect(); + + let repos: Vec<(String, String)> = REntity::find() + .filter(RCol::IsPrivate.eq(false)) + .order_by_asc(RCol::RepoName) + .all(db) + .await + .unwrap_or_default() + .into_iter() + .filter_map(|r| { + let ns = project_map.get(&r.project.to_string())?; + Some((format!("{ns}/{}", r.repo_name), r.updated_at.to_rfc3339())) + }) + .collect(); + + Ok(build_repos_xml(&base, &repos)) + }) + .await + .unwrap_or_else(|_| build_repos_xml(&base, &[])); + + HttpResponse::Ok() + .content_type("application/xml; charset=utf-8") + .body(xml) +} + +// ── Cache helpers ──────────────────────────────────────────────────────────────── + +async fn cached_or_build(cache: &AppCache, key: &str, build: F) -> Result +where + F: FnOnce() -> Fut, + Fut: std::future::Future>, +{ + let cache_key = format!("{CACHE_KEY_PREFIX}:{key}"); + + if let Ok(xml) = get_cached(cache, &cache_key).await { + return Ok(xml); + } + + let xml = build().await?; + + let _ = set_cached(cache, &cache_key, &xml).await; + + Ok(xml) +} + +async fn get_cached(cache: &AppCache, key: &str) -> Result { + let mut conn = cache.redis_pool().get().await.map_err(|e| { + tracing::debug!("sidemap redis get pool error: {}", e); + })?; + redis::cmd("GET") + .arg(key) + .query_async::(&mut conn) + .await + .map_err(|e| { + tracing::debug!("sidemap redis get error: {}", e); + }) +} + +async fn set_cached(cache: &AppCache, key: &str, value: &str) -> Result<(), ()> { + let mut conn = cache.redis_pool().get().await.map_err(|e| { + tracing::debug!("sidemap redis set pool error: {}", e); + })?; + redis::cmd("SETEX") + .arg(key) + .arg(CACHE_TTL_SECS) + .arg(value) + .query_async::<()>(&mut conn) + .await + .map_err(|e| { + tracing::debug!("sidemap redis set error: {}", e); + }) +} + +// ── XML builders ──────────────────────────────────────────────────────────────── + +fn build_static_xml(base: &str) -> String { + let mut xml = xml_header(); + for loc in [ + "/", + "/auth/login", + "/auth/register", + "/auth/password/reset", + "/auth/reset-password", + "/auth/verify-email", + "/about", + "/pricing", + "/pricing/enterprise", + "/pricing/faq", + "/skills", + "/skills/publish", + "/skills/docs", + "/solutions", + "/solutions/rooms", + "/solutions/memory", + "/solutions/governance", + "/network", + "/network/rooms", + "/network/api", + "/docs", + ] { + xml.push_str(&url_entry(&format!("{base}{loc}"), 0.9, "daily", None)); + } + xml.push_str(""); + xml +} + +fn build_users_xml(base: &str, users: &[(String, String)]) -> String { + let mut xml = xml_header(); + for (username, updated) in users { + xml.push_str(&url_entry( + &format!("{base}/user/{username}"), + 0.6, + "weekly", + Some(updated), + )); + } + xml.push_str(""); + xml +} + +fn build_projects_xml(base: &str, projects: &[(String, String, String)]) -> String { + let mut xml = xml_header(); + for (name, _, updated) in projects { + xml.push_str(&url_entry( + &format!("{base}/project/{name}"), + 0.7, + "weekly", + Some(updated), + )); + for sub in [ + "/activity", + "/repositories", + "/issues", + "/members", + "/articles", + "/resources", + ] { + xml.push_str(&url_entry( + &format!("{base}/project/{name}{sub}"), + 0.6, + "weekly", + Some(updated), + )); + } + } + xml.push_str(""); + xml +} + +fn build_repos_xml(base: &str, repos: &[(String, String)]) -> String { + let mut xml = xml_header(); + for (path, updated) in repos { + xml.push_str(&url_entry( + &format!("{base}/repository/{path}"), + 0.7, + "daily", + Some(updated), + )); + for sub in [ + "/files", + "/commits", + "/branches", + "/tags", + "/contributors", + "/pull-requests", + ] { + xml.push_str(&url_entry( + &format!("{base}/repository/{path}{sub}"), + 0.6, + "daily", + Some(updated), + )); + } + } + xml.push_str(""); + xml +} + +fn xml_header() -> String { + String::from( + r#" + +"#, + ) +} + +fn url_entry(loc: &str, priority: f32, changefreq: &str, updated: Option<&str>) -> String { + let updated_xml = updated + .map(|d| format!("\n {d}")) + .unwrap_or_default(); + format!( + r#" + {loc}{updated_xml} + {changefreq} + {priority} + +"#, + ) +}