gitdataai/lib/git/sync/language.rs
2026-05-30 01:38:40 +08:00

177 lines
5.7 KiB
Rust

use std::collections::HashMap;
use db::{database::AppDatabase, sqlx};
use uuid::Uuid;
use crate::{bare::GitBare, cmd::oid::ObjectId, errors::GitError};
/// Maps a file extension (without the leading dot) to a display language name.
///
/// Matching is ASCII-case-insensitive, so `RS`, `Json` and `R` resolve the same
/// way as their lowercase spellings. Returns `None` for unknown extensions and
/// for the empty string.
///
/// The returned name is a `'static` literal and does not borrow from `ext`.
fn language_from_extension(ext: &str) -> Option<&'static str> {
    // Only allocate a lowercased copy when the extension actually contains an
    // uppercase ASCII letter; the common all-lowercase case stays allocation-free.
    let lowered;
    let ext = if ext.bytes().any(|b| b.is_ascii_uppercase()) {
        lowered = ext.to_ascii_lowercase();
        lowered.as_str()
    } else {
        ext
    };

    let language = match ext {
        "rs" => "Rust",
        "ts" | "tsx" => "TypeScript",
        "js" | "jsx" | "mjs" | "cjs" => "JavaScript",
        "py" | "pyi" => "Python",
        "go" => "Go",
        "java" => "Java",
        "kt" | "kts" => "Kotlin",
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hpp" | "hxx" => "C++",
        "cs" => "C#",
        "rb" => "Ruby",
        "php" => "PHP",
        "swift" => "Swift",
        "scala" => "Scala",
        "lua" => "Lua",
        "r" => "R",
        "sql" => "SQL",
        "sh" | "bash" => "Shell",
        "ps1" => "PowerShell",
        "dart" => "Dart",
        "el" => "Emacs Lisp",
        // `.lisp` sources are Common Lisp, not Emacs Lisp.
        "lisp" => "Common Lisp",
        "clj" | "cljs" => "Clojure",
        "hs" => "Haskell",
        "ex" | "exs" => "Elixir",
        "erl" => "Erlang",
        "vue" => "Vue",
        "svelte" => "Svelte",
        "css" | "scss" | "sass" | "less" => "CSS",
        "html" | "htm" => "HTML",
        "xml" | "xsl" | "xsd" => "XML",
        "json" | "jsonl" => "JSON",
        "yaml" | "yml" => "YAML",
        "toml" => "TOML",
        "md" | "markdown" => "Markdown",
        "dockerfile" => "Dockerfile",
        "proto" => "Protocol Buffers",
        "tf" | "hcl" => "HCL",
        "zig" => "Zig",
        "nim" => "Nim",
        "v" => "V",
        "wasm" => "WebAssembly",
        "glsl" => "GLSL",
        "cu" | "cuh" => "CUDA",
        "makefile" => "Makefile",
        _ => return None,
    };
    Some(language)
}
/// Maps a well-known file name (e.g. `Makefile`, `Cargo.toml`) to a display
/// language name.
///
/// The comparison is ASCII-case-insensitive and is done against the whole
/// `name`, not just its extension. Returns `None` when the name is not one of
/// the recognised special files.
///
/// The returned name is a `'static` literal and does not borrow from `name`.
fn language_from_filename(name: &str) -> Option<&'static str> {
    // (lowercase file name, language) pairs. Comparing with
    // `eq_ignore_ascii_case` avoids allocating a lowercased copy per lookup.
    const KNOWN_FILES: [(&str, &str); 8] = [
        ("makefile", "Makefile"),
        ("gnumakefile", "Makefile"),
        ("dockerfile", "Dockerfile"),
        ("cmakelists.txt", "CMake"),
        ("cargo.toml", "TOML"),
        ("package.json", "JSON"),
        ("tsconfig.json", "JSON"),
        (".gitignore", "Gitignore"),
    ];

    // `.gitattributes` shares the label used for `.gitignore`.
    if name.eq_ignore_ascii_case(".gitattributes") {
        return Some("Gitignore");
    }

    KNOWN_FILES
        .iter()
        .find(|(file, _)| name.eq_ignore_ascii_case(file))
        .map(|&(_, language)| language)
}
/// Computes the number of bytes per detected language for the tree of the
/// commit that `HEAD` currently resolves to.
///
/// The root tree id is taken from the decoded `HEAD` commit and handed to
/// [`walk_tree`], which fills the language-name → byte-count map.
///
/// # Errors
///
/// Returns `GitError::Internal` when `HEAD` cannot be resolved, or when the
/// commit cannot be found or decoded. Errors from opening the repository and
/// from the tree walk are propagated unchanged.
fn collect_language_stats(bare: &GitBare) -> Result<HashMap<String, u64>, GitError> {
    let repository = bare.gix_repo()?;

    let head = match repository.head_id() {
        Ok(id) => id,
        Err(err) => {
            return Err(GitError::Internal(format!("failed to resolve HEAD: {}", err)));
        }
    };

    let head_commit = match repository.find_commit(head.detach()) {
        Ok(found) => found,
        Err(err) => {
            return Err(GitError::Internal(format!("failed to find HEAD commit: {}", err)));
        }
    };

    let commit_ref = match head_commit.decode() {
        Ok(parsed) => parsed,
        Err(err) => {
            return Err(GitError::Internal(format!("failed to decode commit: {}", err)));
        }
    };

    // The tree walker works on the crate's own `ObjectId`, built from the hex form.
    let root_tree = ObjectId::new(commit_ref.tree().to_hex().to_string());

    let mut totals: HashMap<String, u64> = HashMap::new();
    walk_tree(bare, &root_tree, &mut totals)?;
    Ok(totals)
}
/// Recursively walks the tree `tree_oid` and adds the size of every recognised
/// source file to `stats`, keyed by language name.
///
/// For each entry returned by `bare.tree_entries`:
/// - sub-trees are descended into,
/// - LFS pointers and entries flagged `is_binary` are skipped,
/// - the language is looked up by full file name first and by extension second;
///   entries with no recognised language are ignored.
///
/// # Errors
///
/// Propagates any error from listing a tree or from reading a blob's size.
fn walk_tree(
    bare: &GitBare,
    tree_oid: &ObjectId,
    stats: &mut HashMap<String, u64>,
) -> Result<(), GitError> {
    let entries = bare.tree_entries(tree_oid.clone())?;
    for entry in entries {
        if entry.kind == crate::cmd::tree::TreeKind::Tree {
            walk_tree(bare, &entry.oid, stats)?;
            continue;
        }
        if entry.kind == crate::cmd::tree::TreeKind::LfsPointer {
            continue;
        }
        if entry.is_binary {
            continue;
        }

        let language = language_from_filename(&entry.name).or_else(|| {
            // Only the text after the last `.` counts as an extension. A name
            // without any dot (e.g. `go`, `c`, `proto`) has no extension and
            // must not be matched as if the whole name were one.
            entry
                .name
                .rsplit_once('.')
                .and_then(|(_, ext)| language_from_extension(ext))
        });

        if let Some(lang) = language {
            let size = blob_size(bare, &entry.oid)?;
            *stats.entry(lang.to_string()).or_insert(0) += size;
        }
    }
    Ok(())
}
/// Returns the size reported by the object header of `oid`.
///
/// Only the header is looked up via `find_header`; the object contents are not
/// requested here.
///
/// # Errors
///
/// Returns `GitError::Internal` when `oid` cannot be converted into a
/// `gix::hash::ObjectId` or when the header lookup fails. Errors from opening
/// the repository are propagated unchanged.
fn blob_size(bare: &GitBare, oid: &ObjectId) -> Result<u64, GitError> {
    let repository = bare.gix_repo()?;

    let converted: Result<gix::hash::ObjectId, _> = oid.try_into();
    let object_id =
        converted.map_err(|err| GitError::Internal(format!("invalid oid: {}", err)))?;

    repository
        .find_header(object_id)
        .map(|header| header.size() as u64)
        .map_err(|err| GitError::Internal(format!("blob header not found: {}", err)))
}
/// Recomputes the language statistics of the repository at `HEAD` and replaces
/// the `repo_language` rows belonging to `repo_id` with the fresh values.
///
/// The existing rows are deleted and the new ones inserted inside a single
/// transaction on the writer pool. The delete runs even when no language was
/// detected, so rows from an earlier sync do not linger after the last
/// recognised file has been removed from the repository.
///
/// Each row stores the language name, its byte count, and its share of the
/// total recognised bytes as a percentage.
///
/// # Errors
///
/// Returns `GitError::Internal` when the transaction cannot be started or
/// committed, when a statement fails, or when a byte count does not fit in an
/// `i64`. Errors from collecting the statistics are propagated unchanged.
pub async fn sync_languages(
    db: &AppDatabase,
    bare: &GitBare,
    repo_id: Uuid,
) -> Result<(), GitError> {
    // NOTE(review): this is a synchronous repository walk executed inside an
    // async fn; presumably acceptable for the caller's runtime — confirm.
    let stats = collect_language_stats(bare)?;
    let total_bytes: u64 = stats.values().sum();

    let pool = db.writer();
    let mut tx = pool
        .begin()
        .await
        .map_err(|e| GitError::Internal(format!("failed to begin tx: {}", e)))?;

    // Always clear the previous snapshot, including when `stats` is empty.
    sqlx::query("DELETE FROM repo_language WHERE repo = $1")
        .bind(repo_id)
        .execute(&mut *tx)
        .await
        .map_err(|e| GitError::Internal(format!("failed to delete repo_language: {}", e)))?;

    for (language, bytes) in &stats {
        // Divide in f64 so large byte counts keep their precision, then narrow
        // to f32, which is the type bound to the `percentage` parameter.
        let percentage: f32 = if total_bytes > 0 {
            ((*bytes as f64 / total_bytes as f64) * 100.0) as f32
        } else {
            0.0
        };
        // Checked conversion: a plain `as i64` would silently wrap.
        let byte_count = i64::try_from(*bytes).map_err(|e| {
            GitError::Internal(format!("byte count for {} out of range: {}", language, e))
        })?;

        sqlx::query(
            "INSERT INTO repo_language (repo, language, bytes, percentage) VALUES ($1, $2, $3, $4)",
        )
        .bind(repo_id)
        .bind(language)
        .bind(byte_count)
        .bind(percentage)
        .execute(&mut *tx)
        .await
        .map_err(|e| GitError::Internal(format!("failed to insert repo_language: {}", e)))?;
    }

    tx.commit()
        .await
        .map_err(|e| GitError::Internal(format!("failed to commit tx: {}", e)))?;

    tracing::info!(
        repo_id = %repo_id,
        languages = stats.len(),
        total_bytes,
        "language stats synced"
    );
    Ok(())
}