feat: add sitemap index with static/users/projects/repos sub-sitemaps
Some checks are pending
CI / Rust Lint & Check (push) Waiting to run
CI / Rust Tests (push) Waiting to run
CI / Frontend Lint & Type Check (push) Waiting to run
CI / Frontend Build (push) Blocked by required conditions

- Main sitemap index at /sitemap.xml referencing 4 sub-sitemaps
- /sidemap/static: fixed routes (homepage, auth, marketing pages)
- /sidemap/users: public user profiles sorted alphabetically
- /sidemap/projects: public projects sorted alphabetically
- /sidemap/repos: public repos sorted alphabetically
- Redis cache with 8h TTL (no refresh on access), key: sidemap:{type}
- robots.txt Sitemap URL uses main_domain() with https:// forced
- All sitemap loc entries use https:// base URL
This commit is contained in:
ZhenYi 2026-04-26 00:03:18 +08:00
parent a8494cc032
commit d593354ba9
5 changed files with 364 additions and 8 deletions

1
Cargo.lock generated
View File

@ -647,6 +647,7 @@ dependencies = [
"mime_guess2", "mime_guess2",
"models", "models",
"queue", "queue",
"redis",
"room", "room",
"rust_decimal", "rust_decimal",
"sea-orm", "sea-orm",

View File

@ -13,7 +13,7 @@ use observability::{
use sea_orm::ConnectionTrait; use sea_orm::ConnectionTrait;
use service::AppService; use service::AppService;
use session::config::{PersistentSession, SessionLifecycle, TtlExtensionPolicy}; use session::config::{PersistentSession, SessionLifecycle, TtlExtensionPolicy};
use api::robots; use api::{robots, sidemap};
use session::storage::RedisClusterSessionStore; use session::storage::RedisClusterSessionStore;
use session::SessionMiddleware; use session::SessionMiddleware;
use std::task::{Context, Poll}; use std::task::{Context, Poll};
@ -220,6 +220,15 @@ async fn main() -> anyhow::Result<()> {
.app_data(http_snapshot_data.clone()) .app_data(http_snapshot_data.clone())
.app_data(prometheus_handle_data.clone()) .app_data(prometheus_handle_data.clone())
.route("/robots.txt", web::get().to(robots::robots)) .route("/robots.txt", web::get().to(robots::robots))
.route("/sitemap.xml", web::get().to(sidemap::sitemap))
.service(
web::scope("/sidemap")
.route("", web::get().to(sidemap::sitemap))
.route("/static", web::get().to(sidemap::sitemap_static))
.route("/users", web::get().to(sidemap::sitemap_users))
.route("/projects", web::get().to(sidemap::sitemap_projects))
.route("/repos", web::get().to(sidemap::sitemap_repos)),
)
.route("/health", web::get().to(health_check)) .route("/health", web::get().to(health_check))
.route("/metrics", web::get().to(prometheus_handler)) .route("/metrics", web::get().to(prometheus_handler))
.configure(api::route::init_routes) .configure(api::route::init_routes)

View File

@ -49,5 +49,6 @@ mime_guess2 = { workspace = true, features = ["phf-map"] }
sea-orm = "2.0.0-rc.37" sea-orm = "2.0.0-rc.37"
rust_decimal = "1.40.0" rust_decimal = "1.40.0"
actix-multipart = { workspace = true, features = ["tempfile"] } actix-multipart = { workspace = true, features = ["tempfile"] }
redis = { workspace = true }
[lints] [lints]
workspace = true workspace = true

View File

@ -1,11 +1,22 @@
use actix_web::HttpResponse; use actix_web::{web, HttpResponse};
use service::AppService;
/// Serves robots.txt, blocking all sensitive paths from crawlers. /// Serves robots.txt, blocking all sensitive paths from crawlers.
pub async fn robots() -> HttpResponse { pub async fn robots(service: web::Data<AppService>) -> HttpResponse {
HttpResponse::Ok() let raw = service
.content_type("text/plain; charset=utf-8") .config
.body( .main_domain()
r#"User-agent: * .unwrap_or_else(|_| "https://gitdata.ai".to_string());
let sitemap_base = if raw.starts_with("https://") {
raw.trim_end_matches('/').to_string()
} else if raw.starts_with("http://") {
raw.replacen("http://", "https://", 1)
} else {
format!("https://{raw}")
};
let body = format!(
r#"User-agent: *
Disallow: /api/ Disallow: /api/
Disallow: /health Disallow: /health
Disallow: /metrics Disallow: /metrics
@ -15,6 +26,12 @@ Disallow: /blob/
Disallow: /media/ Disallow: /media/
Disallow: /static/ Disallow: /static/
Disallow: /assets/ Disallow: /assets/
Sitemap: {sitemap_base}/sitemap.xml
"#, "#,
) );
HttpResponse::Ok()
.content_type("text/plain; charset=utf-8")
.body(body)
} }

View File

@ -0,0 +1,328 @@
use actix_web::{web, HttpResponse};
use db::cache::AppCache;
use models::projects::project::{Column as PCol, Entity as PEntity};
use models::repos::repo::{Column as RCol, Entity as REntity};
use models::users::user::{Column as UCol, Entity as UEntity};
use sea_orm::*;
use service::AppService;
const CACHE_KEY_PREFIX: &str = "sidemap";
const CACHE_TTL_SECS: u64 = 8 * 3600; // 8 hours, no refresh
/// Resolves the public origin used for sitemap URLs, always with an `https://` scheme.
///
/// The value comes from `config.main_domain()` with trailing slashes removed;
/// when that lookup fails, `https://gitdata.ai` is used instead. An `http://`
/// prefix is upgraded to `https://`, and a bare host gets `https://` prepended.
fn public_base(config: &config::AppConfig) -> String {
    let origin = config
        .main_domain()
        .map(|domain| domain.trim_end_matches('/').to_string())
        .unwrap_or_else(|_| "https://gitdata.ai".to_string());

    // Already secure: nothing to rewrite.
    if origin.starts_with("https://") {
        return origin;
    }

    match origin.strip_prefix("http://") {
        // Swap the leading plain-HTTP scheme for HTTPS.
        Some(rest) => format!("https://{rest}"),
        // No scheme at all: treat the value as a host name.
        None => format!("https://{origin}"),
    }
}
// ── Handlers ──────────────────────────────────────────────────────────────────
/// Serves the sitemap index document.
///
/// The index lists the four sub-sitemaps (`static`, `users`, `projects`,
/// `repos`) under `{base}/sidemap/`, where `base` is the value returned by
/// `public_base`. The response is sent as `application/xml`.
pub async fn sitemap(service: web::Data<AppService>) -> HttpResponse {
    let base = public_base(&service.config);
    let index_xml = format!(
        r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>{base}/sidemap/static</loc>
</sitemap>
<sitemap>
<loc>{base}/sidemap/users</loc>
</sitemap>
<sitemap>
<loc>{base}/sidemap/projects</loc>
</sitemap>
<sitemap>
<loc>{base}/sidemap/repos</loc>
</sitemap>
</sitemapindex>"#
    );

    let mut response = HttpResponse::Ok();
    response.content_type("application/xml; charset=utf-8");
    response.body(index_xml)
}
/// Serves the sitemap of fixed routes.
///
/// The document is rebuilt on every request from the hard-coded path list in
/// `build_static_xml`; this handler touches neither the database nor the cache.
pub async fn sitemap_static(service: web::Data<AppService>) -> HttpResponse {
    let body = build_static_xml(&public_base(&service.config));

    let mut response = HttpResponse::Ok();
    response.content_type("application/xml; charset=utf-8");
    response.body(body)
}
/// User profiles sitemap.
pub async fn sitemap_users(service: web::Data<AppService>) -> HttpResponse {
let base = public_base(&service.config);
let xml = cached_or_build(&service.cache, "users", || async {
let db = service.db.reader();
let users: Vec<(String, String)> = UEntity::find()
.filter(UCol::Username.ne(""))
.order_by_asc(UCol::Username)
.all(db)
.await
.unwrap_or_default()
.into_iter()
.map(|u| (u.username, u.updated_at.to_rfc3339()))
.collect();
Ok(build_users_xml(&base, &users))
})
.await
.unwrap_or_else(|_| build_users_xml(&base, &[]));
HttpResponse::Ok()
.content_type("application/xml; charset=utf-8")
.body(xml)
}
/// Public projects sitemap.
pub async fn sitemap_projects(service: web::Data<AppService>) -> HttpResponse {
let base = public_base(&service.config);
let xml = cached_or_build(&service.cache, "projects", || async {
let db = service.db.reader();
let projects: Vec<(String, String, String)> = PEntity::find()
.filter(PCol::IsPublic.eq(true))
.order_by_asc(PCol::Name)
.all(db)
.await
.unwrap_or_default()
.into_iter()
.map(|p| (p.name, p.id.to_string(), p.updated_at.to_rfc3339()))
.collect();
Ok(build_projects_xml(&base, &projects))
})
.await
.unwrap_or_else(|_| build_projects_xml(&base, &[]));
HttpResponse::Ok()
.content_type("application/xml; charset=utf-8")
.body(xml)
}
/// Public repos sitemap.
pub async fn sitemap_repos(service: web::Data<AppService>) -> HttpResponse {
let base = public_base(&service.config);
let xml = cached_or_build(&service.cache, "repos", || async {
let db = service.db.reader();
let project_map: std::collections::HashMap<String, String> = PEntity::find()
.filter(PCol::IsPublic.eq(true))
.all(db)
.await
.unwrap_or_default()
.into_iter()
.map(|p| (p.id.to_string(), p.name))
.collect();
let repos: Vec<(String, String)> = REntity::find()
.filter(RCol::IsPrivate.eq(false))
.order_by_asc(RCol::RepoName)
.all(db)
.await
.unwrap_or_default()
.into_iter()
.filter_map(|r| {
let ns = project_map.get(&r.project.to_string())?;
Some((format!("{ns}/{}", r.repo_name), r.updated_at.to_rfc3339()))
})
.collect();
Ok(build_repos_xml(&base, &repos))
})
.await
.unwrap_or_else(|_| build_repos_xml(&base, &[]));
HttpResponse::Ok()
.content_type("application/xml; charset=utf-8")
.body(xml)
}
// ── Cache helpers ────────────────────────────────────────────────────────────────
/// Returns the cached document for `key`, or builds it and stores it.
///
/// The cache entry is looked up under `{CACHE_KEY_PREFIX}:{key}`. Any failure
/// from `get_cached` is treated as "not cached" and triggers `build`. A failed
/// `build` is propagated as `Err(())` and nothing is written. A failed cache
/// write is ignored and the freshly built document is still returned.
async fn cached_or_build<F, Fut>(cache: &AppCache, key: &str, build: F) -> Result<String, ()>
where
    F: FnOnce() -> Fut,
    Fut: std::future::Future<Output = Result<String, ()>>,
{
    let redis_key = format!("{CACHE_KEY_PREFIX}:{key}");

    match get_cached(cache, &redis_key).await {
        Ok(hit) => Ok(hit),
        Err(()) => {
            let fresh = build().await?;
            // Best effort: the response does not depend on the write succeeding.
            let _ = set_cached(cache, &redis_key, &fresh).await;
            Ok(fresh)
        }
    }
}
/// Reads the string stored at `key` from Redis.
///
/// Returns `Ok(value)` on a hit and `Err(())` when the pool cannot provide a
/// connection, when the `GET` command fails, or when the key does not exist.
///
/// The reply is decoded as `Option<String>` so that a missing key (nil reply)
/// is handled as an ordinary miss instead of surfacing as a type-conversion
/// error and being logged as a Redis failure on every cold request.
async fn get_cached(cache: &AppCache, key: &str) -> Result<String, ()> {
    let mut conn = cache.redis_pool().get().await.map_err(|e| {
        tracing::debug!("sidemap redis get pool error: {}", e);
    })?;
    let hit = redis::cmd("GET")
        .arg(key)
        .query_async::<Option<String>>(&mut conn)
        .await
        .map_err(|e| {
            tracing::debug!("sidemap redis get error: {}", e);
        })?;
    // `None` means the key is absent or expired: a plain cache miss.
    hit.ok_or(())
}
/// Stores `value` at `key` in Redis with a lifetime of `CACHE_TTL_SECS` seconds.
///
/// Issues a single `SETEX`. Returns `Err(())` (after a debug log line) when no
/// pooled connection is available or when the command fails.
async fn set_cached(cache: &AppCache, key: &str, value: &str) -> Result<(), ()> {
    let mut conn = match cache.redis_pool().get().await {
        Ok(conn) => conn,
        Err(e) => {
            tracing::debug!("sidemap redis set pool error: {}", e);
            return Err(());
        }
    };

    let mut setex = redis::cmd("SETEX");
    setex.arg(key).arg(CACHE_TTL_SECS).arg(value);

    match setex.query_async::<()>(&mut conn).await {
        Ok(()) => Ok(()),
        Err(e) => {
            tracing::debug!("sidemap redis set error: {}", e);
            Err(())
        }
    }
}
// ── XML builders ────────────────────────────────────────────────────────────────
/// Renders the `<urlset>` document for the fixed site routes.
///
/// Every path in the list below is emitted as `{base}{path}` with priority
/// `0.9`, change frequency `daily`, and no `<lastmod>`.
fn build_static_xml(base: &str) -> String {
    const STATIC_PATHS: &[&str] = &[
        "/",
        "/auth/login",
        "/auth/register",
        "/auth/password/reset",
        "/auth/reset-password",
        "/auth/verify-email",
        "/about",
        "/pricing",
        "/pricing/enterprise",
        "/pricing/faq",
        "/skills",
        "/skills/publish",
        "/skills/docs",
        "/solutions",
        "/solutions/rooms",
        "/solutions/memory",
        "/solutions/governance",
        "/network",
        "/network/rooms",
        "/network/api",
        "/docs",
    ];

    let mut doc = xml_header();
    doc.extend(
        STATIC_PATHS
            .iter()
            .map(|path| url_entry(&format!("{base}{path}"), 0.9, "daily", None)),
    );
    doc.push_str("</urlset>");
    doc
}
/// Renders the `<urlset>` document for user profiles.
///
/// `users` holds `(username, last-modified)` pairs. Each pair becomes one
/// `{base}/user/{username}` entry with priority `0.6` and change frequency
/// `weekly`. An empty slice yields an empty `<urlset>`.
fn build_users_xml(base: &str, users: &[(String, String)]) -> String {
    let entries: String = users
        .iter()
        .map(|(name, lastmod)| {
            let profile_url = format!("{base}/user/{name}");
            url_entry(&profile_url, 0.6, "weekly", Some(lastmod.as_str()))
        })
        .collect();

    let mut doc = xml_header();
    doc.push_str(&entries);
    doc.push_str("</urlset>");
    doc
}
/// Renders the `<urlset>` document for projects.
///
/// `projects` holds `(name, id, last-modified)` triples; the id is not used
/// here. Each project yields its root page `{base}/project/{name}` at priority
/// `0.7`, followed by its six sub-pages at priority `0.6`, all `weekly` and all
/// carrying the project's last-modified value.
fn build_projects_xml(base: &str, projects: &[(String, String, String)]) -> String {
    const SUB_PAGES: [&str; 6] = [
        "/activity",
        "/repositories",
        "/issues",
        "/members",
        "/articles",
        "/resources",
    ];

    let mut doc = xml_header();
    for (name, _id, lastmod) in projects {
        let root = format!("{base}/project/{name}");
        let lastmod = lastmod.as_str();

        doc.push_str(&url_entry(&root, 0.7, "weekly", Some(lastmod)));
        doc.extend(
            SUB_PAGES
                .iter()
                .map(|page| url_entry(&format!("{root}{page}"), 0.6, "weekly", Some(lastmod))),
        );
    }
    doc.push_str("</urlset>");
    doc
}
/// Renders the `<urlset>` document for repositories.
///
/// `repos` holds `(path, last-modified)` pairs. Each repo yields its root page
/// `{base}/repository/{path}` at priority `0.7`, followed by its six sub-pages
/// at priority `0.6`, all `daily` and all carrying the repo's last-modified
/// value.
fn build_repos_xml(base: &str, repos: &[(String, String)]) -> String {
    const SUB_PAGES: [&str; 6] = [
        "/files",
        "/commits",
        "/branches",
        "/tags",
        "/contributors",
        "/pull-requests",
    ];

    let mut doc = xml_header();
    for (path, lastmod) in repos {
        let root = format!("{base}/repository/{path}");
        let lastmod = lastmod.as_str();

        doc.push_str(&url_entry(&root, 0.7, "daily", Some(lastmod)));
        doc.extend(
            SUB_PAGES
                .iter()
                .map(|page| url_entry(&format!("{root}{page}"), 0.6, "daily", Some(lastmod))),
        );
    }
    doc.push_str("</urlset>");
    doc
}
/// Returns the opening of a sitemap document: the XML declaration followed by
/// the opening `<urlset>` tag, each terminated by a newline. Callers append
/// their entries and the closing `</urlset>` themselves.
fn xml_header() -> String {
    let mut header = String::new();
    header.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    header.push_str("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
    header
}
/// Replaces the five XML-reserved characters (`&`, `<`, `>`, `"`, `'`) with
/// their entity references so the text can be placed inside an element.
fn escape_xml_text(raw: &str) -> String {
    let mut escaped = String::with_capacity(raw.len());
    for ch in raw.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}

/// Renders one `<url>` element of a sitemap.
///
/// * `loc` - absolute URL of the page; written to `<loc>`.
/// * `priority` - written to `<priority>` using `f32`'s `Display` form.
/// * `changefreq` - written verbatim to `<changefreq>`.
/// * `updated` - when `Some`, written to `<lastmod>`; when `None`, the
///   `<lastmod>` element is omitted.
///
/// `loc` and `updated` are entity-escaped before insertion: they can contain
/// user-chosen names, and a raw `&` or `<` would make the whole document
/// invalid XML.
fn url_entry(loc: &str, priority: f32, changefreq: &str, updated: Option<&str>) -> String {
    let loc = escape_xml_text(loc);
    let updated_xml = updated
        .map(|d| format!("\n <lastmod>{}</lastmod>", escape_xml_text(d)))
        .unwrap_or_default();
    format!(
        r#" <url>
<loc>{loc}</loc>{updated_xml}
<changefreq>{changefreq}</changefreq>
<priority>{priority}</priority>
</url>
"#,
    )
}