// gitdataai/libs/service/git/blob.rs
// 2026-04-15 09:08:09 +08:00
// 486 lines, 15 KiB, Rust

use crate::AppService;
use crate::error::AppError;
use crate::git::BlobInfo;
use crate::git_spawn;
use base64::{Engine, engine::general_purpose::STANDARD as BASE64};
use redis::AsyncCommands;
use serde::{Deserialize, Serialize};
use session::Session;
/// Blobs at or above this size (bytes) are served but never written to the
/// Redis cache, keeping cache memory bounded (see `git_blob_content`).
const BLOB_CACHE_SIZE_LIMIT: usize = 512 * 1024;
/// README content larger than this (bytes) is truncated before base64
/// encoding (see `git_readme`).
const README_SIZE_LIMIT: usize = 1024 * 1024;
/// Candidate README paths as `(path, is_markdown, is_root)` tuples.
///
/// The two flags feed the scoring in `git_readme`: root-level files outrank
/// subdirectory files, Markdown outranks other formats, and shorter paths win
/// among equals. Tree matching is case-insensitive, so the case-variant
/// entries mainly affect the `path` string reported back to the caller.
/// Earlier entries win exact score ties — do not reorder casually.
const README_VARIANTS: &[(&str, bool, bool)] = &[
    ("README.md", true, true),
    ("README.markdown", true, true),
    ("README.mkd", true, true),
    ("README.mkdn", true, true),
    ("README.mdown", true, true),
    ("README.rst", false, true),
    ("README.adoc", false, true),
    ("README.txt", true, true),
    ("README.md.txt", true, true),
    ("readme.md", true, true),
    ("Readme.md", true, true),
    ("README.MD", true, true),
    ("readme.markdown", true, true),
    ("Readme", false, true),
    ("readme", false, true),
    ("README", false, true),
    ("readme.rst", false, true),
    ("readme.txt", false, true),
    ("README.md.orig", true, true),
    ("README.md.bak", true, true),
    ("docs/README.md", true, false),
    ("doc/README.md", true, false),
    ("docs/README", false, false),
    ("doc/README", false, false),
    ("docs/README.markdown", true, false),
    ("doc/README.markdown", true, false),
    ("docs/readme.md", true, false),
    ("doc/readme.md", true, false),
    (".github/README.md", true, false),
    ("wiki/README.md", true, false),
    ("site/README.md", true, false),
];
/// Query parameters for the blob endpoints that address a single object.
#[derive(Debug, Clone, Deserialize, utoipa::ToSchema)]
pub struct BlobGetQuery {
    // `serde(default)`: a missing `oid` deserializes as an empty string
    // instead of rejecting the request.
    #[serde(default)]
    pub oid: String,
}
/// Blob metadata returned by `git_blob_get`.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct BlobInfoResponse {
    /// Object id rendered as a string.
    pub oid: String,
    /// Blob size in bytes.
    pub size: usize,
    /// True when the blob was detected as binary.
    pub is_binary: bool,
}
impl From<BlobInfo> for BlobInfoResponse {
fn from(b: BlobInfo) -> Self {
Self {
oid: b.oid.to_string(),
size: b.size,
is_binary: b.is_binary,
}
}
}
/// Blob content payload returned by `git_blob_content`.
///
/// Derives `Deserialize` as well because the whole response is round-tripped
/// through the Redis cache as JSON.
#[derive(Debug, Clone, Serialize, Deserialize, utoipa::ToSchema)]
pub struct BlobContentResponse {
    /// Object id rendered as a string.
    pub oid: String,
    /// Raw (pre-encoding) blob size in bytes.
    pub size: usize,
    /// True when the blob was detected as binary.
    pub is_binary: bool,
    /// Base64-encoded blob bytes.
    pub content: String,
}
/// Existence-check result returned by `git_blob_exists`.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct BlobExistsResponse {
    /// The oid that was queried (echoed back).
    pub oid: String,
    /// Whether an object with this oid exists in the repository.
    pub exists: bool,
}
/// Binary-detection result returned by `git_blob_is_binary`.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct BlobIsBinaryResponse {
    /// The oid that was queried (echoed back).
    pub oid: String,
    /// True when the blob was detected as binary.
    pub is_binary: bool,
}
/// Size lookup result returned by `git_blob_size`.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct BlobSizeResponse {
    /// The oid that was queried (echoed back).
    pub oid: String,
    /// Blob size in bytes.
    pub size: usize,
}
/// Result of `git_blob_create`.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct BlobCreateResponse {
    /// Oid of the newly written blob.
    pub oid: String,
    /// Size in bytes of the decoded data that was written.
    pub size: usize,
}
/// Request body for `git_blob_create`.
#[derive(Debug, Clone, Deserialize, utoipa::ToSchema)]
pub struct BlobCreateRequest {
    /// Base64-encoded bytes to store as a new blob.
    pub data: String,
}
/// Query parameters for `git_readme`.
#[derive(Debug, Clone, Deserialize, utoipa::ToSchema)]
pub struct GitReadmeQuery {
    // Raw identifier because `ref` is a Rust keyword. Any revision the
    // repository can resolve; defaults to `HEAD` when absent.
    pub r#ref: Option<String>,
}
/// README lookup result returned by `git_readme`.
///
/// All `Option` fields are `None` when no README was found.
#[derive(Debug, Clone, Serialize, utoipa::ToSchema)]
pub struct GitReadmeResponse {
    /// Repo-relative path of the selected README, e.g. `README.md`.
    pub path: Option<String>,
    /// Base64-encoded content; `None` when missing or binary.
    pub content: Option<String>,
    /// Full (untruncated) blob size in bytes.
    pub size: Option<usize>,
    /// `"base64"` for text content, `"binary"` for binary blobs.
    pub encoding: Option<String>,
    // NOTE(review): `serde(default)` only affects Deserialize, which this
    // struct does not derive; utoipa may still read it for schema
    // generation — confirm before removing.
    #[serde(default)]
    /// True when `content` was cut at `README_SIZE_LIMIT`.
    pub truncated: bool,
    #[serde(default)]
    /// True when the blob was detected as (or failed UTF-8 as) binary.
    pub is_binary: bool,
}
impl AppService {
    /// Locate the best README for `namespace/repo_name` at `query.ref`
    /// (defaulting to `HEAD`) and return its base64-encoded content.
    ///
    /// Candidates come from `README_VARIANTS`; scoring prefers root-level
    /// files over well-known subdirectories (`docs/`, `doc/`, `.github/`,
    /// `wiki/`, `site/`), Markdown over other formats, and shorter paths on
    /// ties. Matching is case-insensitive. Content is truncated to
    /// `README_SIZE_LIMIT` bytes on a UTF-8 boundary; binary READMEs are
    /// reported without content.
    pub async fn git_readme(
        &self,
        namespace: String,
        repo_name: String,
        query: GitReadmeQuery,
        ctx: &Session,
    ) -> Result<GitReadmeResponse, AppError> {
        let repo = self
            .utils_find_repo(namespace.clone(), repo_name.clone(), ctx)
            .await?;
        // Default to HEAD when the caller did not pin a revision.
        let rev = query.r#ref.unwrap_or_else(|| "HEAD".to_string());
        let tree_oid: git::CommitOid = {
            let rev_clone = rev.clone();
            git_spawn!(repo, domain -> {
                domain.resolve_rev(&rev_clone)
            })?
            .into()
        };
        // Index the root tree once: blob entries and subtree entries, each
        // keyed by lowercased entry name for case-insensitive matching.
        let (root_blobs, subdirs): (
            std::collections::HashMap<String, (String, git::CommitOid)>,
            std::collections::HashMap<String, (String, git::CommitOid)>,
        ) = {
            let oid = tree_oid;
            git_spawn!(repo, domain -> {
                let entries = domain.tree_list(&oid)?;
                let mut blobs: std::collections::HashMap<String, (String, git::CommitOid)> =
                    Default::default();
                let mut dirs: std::collections::HashMap<String, (String, git::CommitOid)> =
                    Default::default();
                for entry in entries {
                    let name_lower = entry.name.to_lowercase();
                    if entry.kind == "tree" {
                        dirs.insert(name_lower, (entry.name.clone(), entry.oid));
                    } else if entry.kind == "blob" {
                        blobs.insert(name_lower, (entry.name.clone(), entry.oid));
                    }
                }
                Ok::<_, AppError>((blobs, dirs))
            })?
        };
        // Index the blobs of each well-known README subdirectory, keyed by
        // the *lowercased* directory name because the lookup below splits
        // the lowercased variant path. (Previously this was keyed by the
        // original-case name, which silently missed READMEs under e.g.
        // `Docs/` or `Doc/`.)
        let subdir_blobs: std::collections::HashMap<
            String,
            std::collections::HashMap<String, (String, git::CommitOid)>,
        > = {
            let mut result: std::collections::HashMap<
                String,
                std::collections::HashMap<String, (String, git::CommitOid)>,
            > = Default::default();
            for (subdir_lower, (_subdir_original, subdir_oid)) in subdirs {
                let interested = matches!(
                    subdir_lower.as_str(),
                    "docs" | "doc" | ".github" | "wiki" | "site"
                );
                if !interested {
                    continue;
                }
                let oid = subdir_oid;
                let repo_inner = repo.clone();
                let entries: std::collections::HashMap<String, (String, git::CommitOid)> =
                    git_spawn!(repo_inner, domain -> {
                        let entries = domain.tree_list(&oid)?;
                        Ok::<std::collections::HashMap<String, (String, git::CommitOid)>, AppError>(
                            entries.into_iter()
                                .filter(|e| e.kind == "blob")
                                .map(|e| (e.name.to_lowercase(), (e.name.clone(), e.oid)))
                                .collect(),
                        )
                    })?;
                result.insert(subdir_lower, entries);
            }
            result
        };
        // A README candidate found in the tree, with its ranking score.
        #[derive(Clone)]
        struct Candidate {
            path: String,
            oid: git::CommitOid,
            score: isize,
        }
        let mut best: Option<Candidate> = None;
        for &(variant, is_markdown, is_root) in README_VARIANTS {
            let lookup = variant.to_lowercase();
            let found: Option<(String, git::CommitOid)> = if is_root {
                root_blobs.get(&lookup).map(|(n, o)| (n.clone(), o.clone()))
            } else {
                // Non-root variants look like "docs/README.md".
                lookup.split_once('/').and_then(|(subdir, rest)| {
                    subdir_blobs.get(subdir).and_then(|subdir_map| {
                        subdir_map.get(rest).map(|(n, o)| (n.clone(), o.clone()))
                    })
                })
            };
            let Some((_blob_name, oid)) = found else {
                continue;
            };
            // Root beats subdir (+1000), Markdown beats other formats
            // (+100), shorter names win ties; strict `>` keeps the earliest
            // variant on equal scores.
            let score = if is_root { 1000 } else { 0 } + if is_markdown { 100 } else { 0 }
                - variant.len() as isize;
            let better = best.as_ref().map(|b| score > b.score).unwrap_or(true);
            if better {
                best = Some(Candidate {
                    path: variant.to_string(),
                    oid,
                    score,
                });
            }
        }
        let Some(candidate) = best else {
            // No README anywhere: empty response, all fields None/false.
            return Ok(GitReadmeResponse {
                path: None,
                content: None,
                size: None,
                encoding: None,
                truncated: false,
                is_binary: false,
            });
        };
        let (raw_bytes, is_binary, total_size) = {
            let oid = candidate.oid;
            git_spawn!(repo, domain -> {
                let content = domain.blob_content(&oid)?;
                Ok::<_, AppError>((content.content, content.is_binary, content.size))
            })?
        };
        if is_binary {
            // Binary READMEs are reported but their content is withheld.
            return Ok(GitReadmeResponse {
                path: Some(candidate.path),
                content: None,
                size: Some(total_size),
                encoding: Some("binary".to_string()),
                truncated: false,
                is_binary: true,
            });
        }
        let truncated = raw_bytes.len() > README_SIZE_LIMIT;
        let to_encode: Vec<u8> = if truncated {
            let mut cut = raw_bytes[..README_SIZE_LIMIT].to_vec();
            // Trim back to the longest valid-UTF-8 prefix so the cut does
            // not split a multi-byte character. `valid_up_to` yields that
            // prefix length in one pass; the previous pop-one-byte loop
            // computed the same prefix in O(n^2) worst case.
            if let Err(e) = std::str::from_utf8(&cut) {
                cut.truncate(e.valid_up_to());
            }
            cut
        } else {
            raw_bytes
        };
        // Even blobs the domain flagged non-binary may fail UTF-8
        // validation; mark those binary but still ship the base64 payload.
        let (content_b64, is_binary_final, encoding) = match std::str::from_utf8(&to_encode) {
            Ok(_) => (BASE64.encode(&to_encode), false, "base64".to_string()),
            Err(_) => (BASE64.encode(&to_encode), true, "binary".to_string()),
        };
        Ok(GitReadmeResponse {
            path: Some(candidate.path),
            content: Some(content_b64),
            size: Some(total_size),
            encoding: Some(encoding),
            truncated,
            is_binary: is_binary_final,
        })
    }
    /// Fetch blob metadata (oid, size, binary flag) for `query.oid`.
    pub async fn git_blob_get(
        &self,
        namespace: String,
        repo_name: String,
        query: BlobGetQuery,
        ctx: &Session,
    ) -> Result<BlobInfoResponse, AppError> {
        let repo = self.utils_find_repo(namespace, repo_name, ctx).await?;
        let oid_str = query.oid.clone();
        // Git object access is blocking; run it off the async runtime.
        let info = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo)?;
            let oid = git::CommitOid::new(&oid_str);
            domain.blob_get(&oid)
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        Ok(BlobInfoResponse::from(info))
    }
    /// Report whether an object with `query.oid` exists in the repository.
    pub async fn git_blob_exists(
        &self,
        namespace: String,
        repo_name: String,
        query: BlobGetQuery,
        ctx: &Session,
    ) -> Result<BlobExistsResponse, AppError> {
        let repo = self.utils_find_repo(namespace, repo_name, ctx).await?;
        let oid_str = query.oid.clone();
        let exists = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo)?;
            let oid = git::CommitOid::new(&oid_str);
            Ok::<_, git::GitError>(domain.blob_exists(&oid))
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        Ok(BlobExistsResponse {
            oid: query.oid,
            exists,
        })
    }
    /// Report whether the blob at `query.oid` is detected as binary.
    pub async fn git_blob_is_binary(
        &self,
        namespace: String,
        repo_name: String,
        query: BlobGetQuery,
        ctx: &Session,
    ) -> Result<BlobIsBinaryResponse, AppError> {
        let repo = self.utils_find_repo(namespace, repo_name, ctx).await?;
        let oid_str = query.oid.clone();
        let is_binary = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo)?;
            let oid = git::CommitOid::new(&oid_str);
            domain.blob_is_binary(&oid)
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        Ok(BlobIsBinaryResponse {
            oid: query.oid,
            is_binary,
        })
    }
    /// Fetch a blob's content, base64-encoded, with a Redis read-through
    /// cache (1-hour TTL).
    ///
    /// Cache access is strictly best-effort: a connection failure, miss, or
    /// undecodable entry falls through to the git lookup; a failed write is
    /// only logged.
    pub async fn git_blob_content(
        &self,
        namespace: String,
        repo_name: String,
        query: BlobGetQuery,
        ctx: &Session,
    ) -> Result<BlobContentResponse, AppError> {
        let repo = self
            .utils_find_repo(namespace.clone(), repo_name.clone(), ctx)
            .await?;
        let cache_key = format!("git:blob:{}:{}:{}", namespace, repo_name, query.oid);
        // Best-effort cache read.
        if let Ok(mut conn) = self.cache.conn().await {
            if let Ok(cached) = conn.get::<_, String>(cache_key.clone()).await {
                if let Ok(cached) = serde_json::from_str::<BlobContentResponse>(&cached) {
                    return Ok(cached);
                }
            }
        }
        let repo_clone = repo.clone();
        let oid_str = query.oid.clone();
        let content = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo_clone)?;
            let oid = git::CommitOid::new(&oid_str);
            domain.blob_content(&oid)
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        let response = BlobContentResponse {
            oid: query.oid.clone(),
            size: content.size,
            is_binary: content.is_binary,
            content: BASE64.encode(&content.content),
        };
        // Only cache blobs smaller than the size limit to prevent memory exhaustion
        if response.size < BLOB_CACHE_SIZE_LIMIT {
            if let Ok(mut conn) = self.cache.conn().await {
                // Skip the write when serialization fails instead of caching
                // an empty string (the previous behavior), which a later
                // read could never deserialize anyway.
                if let Ok(payload) = serde_json::to_string(&response) {
                    if let Err(e) = conn
                        .set_ex::<String, String, ()>(cache_key, payload, 60 * 60)
                        .await
                    {
                        slog::debug!(self.logs, "cache set failed (non-fatal): {}", e);
                    }
                }
            }
        }
        Ok(response)
    }
    /// Fetch the size in bytes of the blob at `query.oid`.
    pub async fn git_blob_size(
        &self,
        namespace: String,
        repo_name: String,
        query: BlobGetQuery,
        ctx: &Session,
    ) -> Result<BlobSizeResponse, AppError> {
        let repo = self.utils_find_repo(namespace, repo_name, ctx).await?;
        let oid_str = query.oid.clone();
        let size = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo)?;
            let oid = git::CommitOid::new(&oid_str);
            domain.blob_size(&oid)
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        Ok(BlobSizeResponse {
            oid: query.oid,
            size,
        })
    }
    /// Write a new blob from base64-encoded `request.data` and return its
    /// oid plus the decoded size in bytes.
    ///
    /// NOTE(review): invalid base64 currently maps to
    /// `InternalServerError`; a 4xx-style variant would fit better if
    /// `AppError` offers one — confirm.
    pub async fn git_blob_create(
        &self,
        namespace: String,
        repo_name: String,
        request: BlobCreateRequest,
        ctx: &Session,
    ) -> Result<BlobCreateResponse, AppError> {
        let repo = self.utils_find_repo(namespace, repo_name, ctx).await?;
        let data = BASE64
            .decode(&request.data)
            .map_err(|_| AppError::InternalServerError("invalid base64 data".to_string()))?;
        let repo_clone = repo.clone();
        let data_clone = data.clone();
        let oid = tokio::task::spawn_blocking(move || {
            let domain = git::GitDomain::from_model(repo_clone)?;
            domain.blob_create(&data_clone)
        })
        .await
        .map_err(|e| AppError::InternalServerError(format!("Task join error: {}", e)))?
        .map_err(AppError::from)?;
        Ok(BlobCreateResponse {
            oid: oid.to_string(),
            size: data.len(),
        })
    }
}