gitdataai/libs/git/archive/ops.rs
2026-04-15 09:08:09 +08:00

553 lines
19 KiB
Rust

//! Archive operations.
//!
//! Generates .tar, .tar.gz, and .zip archives from git trees with caching support.
use std::fs;
use std::io::{Cursor, Write};
use std::path::PathBuf;
use flate2::Compression;
use flate2::write::GzEncoder;
use crate::archive::types::{ArchiveEntry, ArchiveFormat, ArchiveOptions, ArchiveSummary};
use crate::commit::types::CommitOid;
use crate::{GitDomain, GitError, GitResult};
impl GitDomain {
/// Directory where cached archives are stored.
fn archive_cache_dir(&self) -> PathBuf {
    let mut dir = PathBuf::from(self.repo().path());
    dir.push(".git-archives");
    dir
}
/// Path to the cached archive file for a given commit/format/options.
///
/// The file name is `<oid><options-cache-key>.<ext>` inside the archive
/// cache directory, so different option sets never collide.
fn archive_cache_path(
    &self,
    commit_oid: &CommitOid,
    format: ArchiveFormat,
    opts: &ArchiveOptions,
) -> PathBuf {
    let extension = match format {
        ArchiveFormat::Zip => "zip",
        ArchiveFormat::TarGz => "tar.gz",
        ArchiveFormat::Tar => "tar",
    };
    let file_name = format!("{}{}.{}", commit_oid.as_str(), opts.cache_key(), extension);
    self.archive_cache_dir().join(file_name)
}
/// Ensure the cache directory exists.
///
/// `create_dir_all` succeeds when the directory is already present, so no
/// prior `exists()` probe is needed (the old check was a TOCTOU race: the
/// directory could appear or vanish between the probe and the create).
fn ensure_archive_cache_dir(&self) -> GitResult<()> {
    fs::create_dir_all(self.archive_cache_dir())
        .map_err(|e| GitError::IoError(e.to_string()))
}
/// Generate a plain tar archive from a commit's tree.
/// Caches the result after first build.
///
/// Returns the raw archive bytes; on a cache hit the cached file is read
/// back verbatim without touching the object database.
pub fn archive_tar(
    &self,
    commit_oid: &CommitOid,
    opts: Option<ArchiveOptions>,
) -> GitResult<Vec<u8>> {
    let opts = opts.unwrap_or_default();
    let cache_path = self.archive_cache_path(commit_oid, ArchiveFormat::Tar, &opts);
    if cache_path.exists() {
        return fs::read(&cache_path).map_err(|e| GitError::IoError(e.to_string()));
    }
    let tree = self.tree_from_commit(commit_oid)?;
    let mut buf = Vec::new();
    let base = opts.prefix.as_deref().unwrap_or("");
    self.walk_tar(&mut buf, &tree, base, &opts)?;
    // A tar stream is terminated by two 512-byte blocks of zeros; without
    // them compliant readers report an unexpected end of archive.
    buf.extend_from_slice(&[0u8; 1024]);
    self.ensure_archive_cache_dir()?;
    fs::write(&cache_path, &buf).map_err(|e| GitError::IoError(e.to_string()))?;
    Ok(buf)
}
/// Generate a tar.gz archive from a commit's tree.
/// Caches the result after first build.
pub fn archive_tar_gz(
    &self,
    commit_oid: &CommitOid,
    opts: Option<ArchiveOptions>,
) -> GitResult<Vec<u8>> {
    let opts = opts.unwrap_or_default();
    let cache_path = self.archive_cache_path(commit_oid, ArchiveFormat::TarGz, &opts);
    if cache_path.exists() {
        return fs::read(&cache_path).map_err(|e| GitError::IoError(e.to_string()));
    }
    // Cache miss: stream the tree through a tar builder that writes into a
    // gzip encoder backed by `out`.
    let tree = self.tree_from_commit(commit_oid)?;
    let mut out = Vec::new();
    {
        let mut builder =
            tar::Builder::new(GzEncoder::new(&mut out, Compression::default()));
        let base = opts.prefix.as_deref().unwrap_or("");
        self.walk_tar_builder(&mut builder, &tree, base, &opts)?;
        // into_inner() finalizes the tar stream and hands back the encoder;
        // finish() then flushes the gzip trailer.
        builder
            .into_inner()
            .map_err(|e| GitError::Internal(e.to_string()))?
            .finish()
            .map_err(|e| GitError::Internal(e.to_string()))?;
    }
    self.ensure_archive_cache_dir()?;
    fs::write(&cache_path, &out).map_err(|e| GitError::IoError(e.to_string()))?;
    Ok(out)
}
/// Generate a zip archive from a commit's tree.
/// Caches the result after first build.
pub fn archive_zip(
    &self,
    commit_oid: &CommitOid,
    opts: Option<ArchiveOptions>,
) -> GitResult<Vec<u8>> {
    let opts = opts.unwrap_or_default();
    let cache_path = self.archive_cache_path(commit_oid, ArchiveFormat::Zip, &opts);
    if cache_path.exists() {
        return fs::read(&cache_path).map_err(|e| GitError::IoError(e.to_string()));
    }
    // Cache miss: build the archive in memory, then persist it.
    let tree = self.tree_from_commit(commit_oid)?;
    let base = opts.prefix.as_deref().unwrap_or("");
    let mut out = Vec::new();
    self.walk_zip(&mut out, &tree, base, &opts)?;
    self.ensure_archive_cache_dir()?;
    fs::write(&cache_path, &out).map_err(|e| GitError::IoError(e.to_string()))?;
    Ok(out)
}
/// Generate an archive in the specified format.
/// Results are cached keyed by (commit_oid, format, options).
pub fn archive(
    &self,
    commit_oid: &CommitOid,
    format: ArchiveFormat,
    opts: Option<ArchiveOptions>,
) -> GitResult<Vec<u8>> {
    // Dispatch to the per-format builder; each handles its own caching.
    match format {
        ArchiveFormat::Zip => self.archive_zip(commit_oid, opts),
        ArchiveFormat::TarGz => self.archive_tar_gz(commit_oid, opts),
        ArchiveFormat::Tar => self.archive_tar(commit_oid, opts),
    }
}
/// List all entries that would be included in an archive.
///
/// Only blobs are listed; directories are traversed but not emitted,
/// matching the archive walkers.
pub fn archive_list(
    &self,
    commit_oid: &CommitOid,
    opts: Option<ArchiveOptions>,
) -> GitResult<Vec<ArchiveEntry>> {
    let opts = opts.unwrap_or_default();
    let tree = self.tree_from_commit(commit_oid)?;
    let mut collected = Vec::new();
    self.collect_tree_entries(&mut collected, &tree, "", 0, &opts)?;
    Ok(collected)
}
/// Summarize the archive that would be produced for a commit: entry count
/// and the sum of uncompressed blob sizes.
pub fn archive_summary(
    &self,
    commit_oid: &CommitOid,
    format: ArchiveFormat,
    opts: Option<ArchiveOptions>,
) -> GitResult<ArchiveSummary> {
    let entries = self.archive_list(commit_oid, opts)?;
    let total_size = entries.iter().fold(0u64, |acc, e| acc + e.size);
    Ok(ArchiveSummary {
        commit_oid: commit_oid.to_string(),
        format,
        total_entries: entries.len(),
        total_size,
    })
}
/// Whether a cached archive already exists for (commit, format, options).
pub fn archive_cached(
    &self,
    commit_oid: &CommitOid,
    format: ArchiveFormat,
    opts: Option<ArchiveOptions>,
) -> bool {
    let opts = opts.unwrap_or_default();
    let path = self.archive_cache_path(commit_oid, format, &opts);
    path.exists()
}
/// Invalidate (delete) a cached archive, if it exists.
/// Call this when you need a fresh build after the repo state changes.
///
/// Returns `Ok(true)` if a cached file was removed, `Ok(false)` if none
/// existed. The file is removed directly and a NotFound error is mapped to
/// `Ok(false)`, avoiding the exists()/remove race of a prior probe.
pub fn archive_invalidate(
    &self,
    commit_oid: &CommitOid,
    format: ArchiveFormat,
    opts: Option<ArchiveOptions>,
) -> GitResult<bool> {
    let opts = opts.unwrap_or_default();
    let path = self.archive_cache_path(commit_oid, format, &opts);
    match fs::remove_file(&path) {
        Ok(()) => Ok(true),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
        Err(e) => Err(GitError::IoError(e.to_string())),
    }
}
/// List all cached archive paths for a given commit.
///
/// Matching is by file-name prefix: every cache file for a commit starts
/// with that commit's OID (see `archive_cache_path`).
pub fn archive_cache_list(&self, commit_oid: &CommitOid) -> GitResult<Vec<PathBuf>> {
    let dir = self.archive_cache_dir();
    if !dir.exists() {
        return Ok(Vec::new());
    }
    let wanted = commit_oid.as_str();
    let reader = fs::read_dir(&dir).map_err(|e| GitError::IoError(e.to_string()))?;
    let mut matches = Vec::new();
    for item in reader {
        let item = item.map_err(|e| GitError::IoError(e.to_string()))?;
        if item.file_name().to_string_lossy().starts_with(wanted) {
            matches.push(item.path());
        }
    }
    Ok(matches)
}
/// Invalidate all cached archives for a given commit.
///
/// Returns the number of cache files removed; stops at the first removal
/// error.
pub fn archive_invalidate_all(&self, commit_oid: &CommitOid) -> GitResult<usize> {
    let mut removed = 0usize;
    for path in self.archive_cache_list(commit_oid)? {
        fs::remove_file(&path).map_err(|e| GitError::IoError(e.to_string()))?;
        removed += 1;
    }
    Ok(removed)
}
/// Resolve a commit OID string to that commit's root tree.
fn tree_from_commit(&self, commit_oid: &CommitOid) -> GitResult<git2::Tree<'_>> {
    let oid = match commit_oid.to_oid() {
        Ok(o) => o,
        Err(_) => return Err(GitError::InvalidOid(commit_oid.to_string())),
    };
    let repo = self.repo();
    let commit = repo
        .find_commit(oid)
        .map_err(|e| GitError::Internal(e.to_string()))?;
    repo.find_tree(commit.tree_id())
        .map_err(|e| GitError::Internal(e.to_string()))
}
/// Recursively append `tree`'s blobs to `buf` as hand-rolled USTAR entries.
///
/// `base` is the path prefix for entries at this level (the archive prefix
/// on the first call). Directories are not emitted as entries; their files
/// carry the full path. Errors on paths or sizes not representable in the
/// USTAR header format.
///
/// Fixes over the previous version: every numeric header field is written
/// zero-padded to exactly fill its slot (`copy_from_slice` panics on a
/// length mismatch, so the old unpadded writes panicked on the first blob);
/// the magic/version bytes are the correct `ustar\0` + `00`; and long paths
/// are split into the USTAR prefix field at offset 345 instead of being
/// written over the name/mode region at offset 0.
fn walk_tar(
    &self,
    buf: &mut Vec<u8>,
    tree: &git2::Tree<'_>,
    base: &str,
    opts: &ArchiveOptions,
) -> GitResult<()> {
    // USTAR limits: name <= 100 bytes, prefix <= 155 bytes; they are joined
    // by an implicit '/' stored in neither field.
    const NAME_MAX: usize = 100;
    const PREFIX_MAX: usize = 155;
    // Write `value` as zero-padded octal, NUL-terminated, exactly filling
    // `field`. Returns false when the value does not fit.
    fn put_octal(field: &mut [u8], value: u64) -> bool {
        let digits = field.len() - 1;
        let text = format!("{:0width$o}", value, width = digits);
        if text.len() != digits {
            return false;
        }
        field[..digits].copy_from_slice(text.as_bytes());
        field[digits] = 0;
        true
    }
    for entry in tree.iter() {
        let name = entry.name().unwrap_or("");
        let full_path = if base.is_empty() {
            name.to_string()
        } else {
            format!("{}/{}", base, name)
        };
        if !self.entry_passes_filter(&full_path, opts) {
            continue;
        }
        let oid = entry.id();
        // Unresolvable objects (e.g. submodule commits) are skipped.
        let obj = match self.repo().find_object(oid, None) {
            Ok(o) => o,
            Err(_) => continue,
        };
        let mode = entry.filemode() as u32;
        if obj.kind() == Some(git2::ObjectType::Tree) {
            // Recurse only while within max_depth; a tree path at depth k
            // contains exactly k slashes.
            if opts
                .max_depth
                .map_or(true, |d| full_path.matches('/').count() < d)
            {
                let sub_tree = self
                    .repo()
                    .find_tree(oid)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                self.walk_tar(buf, &sub_tree, &full_path, opts)?;
            }
        } else {
            let blob = match obj.as_blob() {
                Some(b) => b,
                None => continue,
            };
            let content = blob.content();
            let size = content.len() as u64;
            let mut header = [0u8; 512];
            let path_bytes = full_path.as_bytes();
            if path_bytes.len() <= NAME_MAX {
                // Short path: fits entirely in the name field.
                header[..path_bytes.len()].copy_from_slice(path_bytes);
            } else {
                // Long path: split at a '/' such that the part before it
                // fits the prefix field and the part after fits the name
                // field. Prefer the latest (longest-prefix) valid split.
                let split = path_bytes
                    .iter()
                    .enumerate()
                    .filter(|&(i, &b)| {
                        b == b'/' && i <= PREFIX_MAX && path_bytes.len() - i - 1 <= NAME_MAX
                    })
                    .map(|(i, _)| i)
                    .last()
                    .ok_or_else(|| {
                        GitError::Internal(format!(
                            "path too long for tar format: {}",
                            full_path
                        ))
                    })?;
                header[345..345 + split].copy_from_slice(&path_bytes[..split]);
                let name_bytes = &path_bytes[split + 1..];
                header[..name_bytes.len()].copy_from_slice(name_bytes);
            }
            // mode / uid / gid are 8-byte octal fields; mtime is 12 bytes.
            if !put_octal(&mut header[100..108], u64::from(mode & 0o777))
                || !put_octal(&mut header[108..116], 0)
                || !put_octal(&mut header[116..124], 0)
                || !put_octal(&mut header[136..148], 0)
            {
                return Err(GitError::Internal(
                    "tar header field overflow".to_string(),
                ));
            }
            if !put_octal(&mut header[124..136], size) {
                return Err(GitError::Internal(format!(
                    "file size {} exceeds maximum for tar format (12-byte octal field)",
                    size
                )));
            }
            header[156] = b'0'; // typeflag: regular file
            header[257..263].copy_from_slice(b"ustar\0"); // USTAR magic
            header[263..265].copy_from_slice(b"00"); // USTAR version
            // Checksum: sum of all 512 header bytes with the checksum field
            // treated as eight spaces, then stored as six octal digits, a
            // NUL, and a space (POSIX convention).
            header[148..156].copy_from_slice(b"        ");
            let sum: u32 = header.iter().map(|&b| u32::from(b)).sum();
            let sum_octal = format!("{:06o}\0 ", sum);
            header[148..156].copy_from_slice(sum_octal.as_bytes());
            buf.write_all(&header)
                .map_err(|e| GitError::IoError(e.to_string()))?;
            buf.write_all(content)
                .map_err(|e| GitError::IoError(e.to_string()))?;
            // Pad the content with NULs to the next 512-byte boundary.
            let padding = (512 - content.len() % 512) % 512;
            if padding > 0 {
                buf.write_all(&vec![0u8; padding])
                    .map_err(|e| GitError::IoError(e.to_string()))?;
            }
        }
    }
    Ok(())
}
/// Recursively append `tree`'s blobs to a gzip-backed tar `builder`.
///
/// Uses `Builder::append_data`, which sets the entry path itself and emits
/// GNU long-name extension records when the path exceeds the 100-byte
/// header name field (the previous `header.set_path` call simply errored
/// on such paths). The permission mask is 0o777 for consistency with
/// `walk_tar`; for git's 0o100644/0o100755 blob modes the result (644/755)
/// is unchanged.
fn walk_tar_builder(
    &self,
    builder: &mut tar::Builder<GzEncoder<&mut Vec<u8>>>,
    tree: &git2::Tree<'_>,
    base: &str,
    opts: &ArchiveOptions,
) -> GitResult<()> {
    for entry in tree.iter() {
        let name = entry.name().unwrap_or("");
        let full_path = if base.is_empty() {
            name.to_string()
        } else {
            format!("{}/{}", base, name)
        };
        if !self.entry_passes_filter(&full_path, opts) {
            continue;
        }
        let oid = entry.id();
        // Unresolvable objects (e.g. submodule commits) are skipped.
        let obj = match self.repo().find_object(oid, None) {
            Ok(o) => o,
            Err(_) => continue,
        };
        let mode = entry.filemode() as u32;
        if obj.kind() == Some(git2::ObjectType::Tree) {
            // Recurse only while within max_depth; a tree path at depth k
            // contains exactly k slashes.
            if opts
                .max_depth
                .map_or(true, |d| full_path.matches('/').count() < d)
            {
                let sub_tree = self
                    .repo()
                    .find_tree(oid)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                self.walk_tar_builder(builder, &sub_tree, &full_path, opts)?;
            }
        } else {
            let blob = match obj.as_blob() {
                Some(b) => b,
                None => continue,
            };
            let content = blob.content();
            let mut header = tar::Header::new_gnu();
            header.set_size(content.len() as u64);
            header.set_mode(mode & 0o777);
            // append_data sets the path (adding long-name records when
            // needed) and finalizes the header checksum.
            builder
                .append_data(&mut header, &full_path, content)
                .map_err(|e| GitError::Internal(e.to_string()))?;
        }
    }
    Ok(())
}
/// Build a complete zip archive of `tree` into `zip_buf`.
fn walk_zip(
    &self,
    zip_buf: &mut Vec<u8>,
    tree: &git2::Tree<'_>,
    base: &str,
    opts: &ArchiveOptions,
) -> GitResult<()> {
    let writer = zip::ZipWriter::new(Cursor::new(zip_buf));
    let writer = self.walk_zip_impl(writer, tree, base, opts)?;
    // finish() writes the central directory; the returned cursor is
    // dropped, leaving the bytes in zip_buf.
    writer
        .finish()
        .map_err(|e| GitError::Internal(e.to_string()))?;
    Ok(())
}
/// Recursively add `tree`'s blobs to the zip writer.
///
/// The writer is threaded through recursion by value and returned, since it
/// owns the cursor over the output buffer. The permission mask is 0o777 for
/// consistency with `walk_tar`; for git's 0o100644/0o100755 blob modes the
/// stored permissions (644/755) are unchanged.
fn walk_zip_impl<'a>(
    &'a self,
    mut zip: zip::ZipWriter<Cursor<&'a mut Vec<u8>>>,
    tree: &git2::Tree<'_>,
    base: &str,
    opts: &ArchiveOptions,
) -> GitResult<zip::ZipWriter<Cursor<&'a mut Vec<u8>>>> {
    for entry in tree.iter() {
        let name = entry.name().unwrap_or("");
        let full_path = if base.is_empty() {
            name.to_string()
        } else {
            format!("{}/{}", base, name)
        };
        if !self.entry_passes_filter(&full_path, opts) {
            continue;
        }
        let oid = entry.id();
        // Unresolvable objects (e.g. submodule commits) are skipped.
        let obj = match self.repo().find_object(oid, None) {
            Ok(o) => o,
            Err(_) => continue,
        };
        let mode = entry.filemode() as u32;
        if obj.kind() == Some(git2::ObjectType::Tree) {
            // Recurse only while within max_depth; a tree path at depth k
            // contains exactly k slashes.
            if opts
                .max_depth
                .map_or(true, |d| full_path.matches('/').count() < d)
            {
                let sub_tree = self
                    .repo()
                    .find_tree(oid)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                zip = self.walk_zip_impl(zip, &sub_tree, &full_path, opts)?;
            }
        } else {
            let blob = match obj.as_blob() {
                Some(b) => b,
                None => continue,
            };
            let content = blob.content();
            let options = zip::write::SimpleFileOptions::default()
                .compression_method(zip::CompressionMethod::Deflated)
                .unix_permissions(mode & 0o777);
            zip.start_file(&full_path, options)
                .map_err(|e| GitError::Internal(e.to_string()))?;
            zip.write_all(content)
                .map_err(|e| GitError::Internal(e.to_string()))?;
        }
    }
    Ok(zip)
}
/// Recursively collect blob entries under `tree` into `entries`.
///
/// `depth` is the recursion level of this call (0 at the root). To match
/// the archive walkers, `max_depth` only limits how deep we recurse into
/// subtrees; entries at an allowed level are always listed. (The previous
/// version also skipped *entries* at `depth >= d`, so with `max_depth = 1`
/// the archive contained `dir/file` while the listing omitted it.)
///
/// NOTE(review): the archive walkers seed `base` with `opts.prefix` while
/// this listing starts at "" (see `archive_list`), so `path_filter` sees
/// unprefixed paths here — confirm whether the two are meant to agree.
fn collect_tree_entries(
    &self,
    entries: &mut Vec<ArchiveEntry>,
    tree: &git2::Tree<'_>,
    prefix: &str,
    depth: usize,
    opts: &ArchiveOptions,
) -> GitResult<()> {
    for entry in tree.iter() {
        let name = entry.name().unwrap_or("");
        let full_path = if prefix.is_empty() {
            name.to_string()
        } else {
            format!("{}/{}", prefix, name)
        };
        if !self.entry_passes_filter(&full_path, opts) {
            continue;
        }
        let oid = entry.id();
        // Unresolvable objects (e.g. submodule commits) are skipped.
        let obj = match self.repo().find_object(oid, None) {
            Ok(o) => o,
            Err(_) => continue,
        };
        let mode = entry.filemode() as u32;
        if obj.kind() == Some(git2::ObjectType::Tree) {
            // Walkers recurse into a tree at depth k iff k < max_depth.
            if opts.max_depth.map_or(true, |d| depth < d) {
                let sub_tree = self
                    .repo()
                    .find_tree(oid)
                    .map_err(|e| GitError::Internal(e.to_string()))?;
                self.collect_tree_entries(entries, &sub_tree, &full_path, depth + 1, opts)?;
            }
        } else {
            let size = obj.as_blob().map(|b| b.size() as u64).unwrap_or(0);
            entries.push(ArchiveEntry {
                path: full_path,
                oid: oid.to_string(),
                size,
                mode,
            });
        }
    }
    Ok(())
}
/// Check whether `full_path` should be included given `opts.path_filter`.
///
/// The filter names a file or a directory subtree. A path passes when it
/// equals the filter, lies under the filter directory, or is an ancestor
/// directory of the filter — ancestors must pass so traversal can descend
/// to the filtered subtree (the walkers never emit directory entries, so
/// letting ancestors through adds nothing to the archive). Matching is per
/// path component, so a filter of "src" does not match "src2/file".
///
/// (The previous bare `starts_with` both over-matched across component
/// boundaries and rejected ancestor directories, so a nested filter like
/// "src/main.rs" could never be reached and produced an empty archive.)
fn entry_passes_filter(&self, full_path: &str, opts: &ArchiveOptions) -> bool {
    let filter = match opts.path_filter.as_deref() {
        Some(f) => f.trim_end_matches('/'),
        None => return true,
    };
    if filter.is_empty() || full_path == filter {
        return true;
    }
    // Descendant of the filter directory (component-boundary prefix).
    if let Some(rest) = full_path.strip_prefix(filter) {
        if rest.starts_with('/') {
            return true;
        }
    }
    // Ancestor directory of the filter: allow so recursion can descend.
    if let Some(rest) = filter.strip_prefix(full_path) {
        if rest.starts_with('/') {
            return true;
        }
    }
    false
}
}