Compare commits
4 Commits
d593354ba9
...
02b7a5beda
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
02b7a5beda | ||
|
|
7eb9c5a7fb | ||
|
|
fd232354cc | ||
|
|
a4dd25304c |
@ -45,13 +45,13 @@ spec:
|
|||||||
containerPort: {{ $svc.service.ssh.port }}
|
containerPort: {{ $svc.service.ssh.port }}
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
- name: health
|
- name: health
|
||||||
containerPort: 8021
|
containerPort: {{ $svc.livenessProbe.httpGet.port | default 8021 | int }}
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
{{- if $svc.livenessProbe }}
|
{{- if $svc.livenessProbe }}
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: {{ $svc.livenessProbe.path }}
|
path: {{ $svc.livenessProbe.httpGet.path }}
|
||||||
port: {{ $svc.livenessProbe.port }}
|
port: health
|
||||||
initialDelaySeconds: {{ $svc.livenessProbe.initialDelaySeconds }}
|
initialDelaySeconds: {{ $svc.livenessProbe.initialDelaySeconds }}
|
||||||
periodSeconds: {{ $svc.livenessProbe.periodSeconds }}
|
periodSeconds: {{ $svc.livenessProbe.periodSeconds }}
|
||||||
timeoutSeconds: {{ $svc.livenessProbe.timeoutSeconds | default 3 }}
|
timeoutSeconds: {{ $svc.livenessProbe.timeoutSeconds | default 3 }}
|
||||||
@ -60,8 +60,8 @@ spec:
|
|||||||
{{- if $svc.readinessProbe }}
|
{{- if $svc.readinessProbe }}
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: {{ $svc.readinessProbe.path }}
|
path: {{ $svc.readinessProbe.httpGet.path }}
|
||||||
port: {{ $svc.readinessProbe.port }}
|
port: health
|
||||||
initialDelaySeconds: {{ $svc.readinessProbe.initialDelaySeconds }}
|
initialDelaySeconds: {{ $svc.readinessProbe.initialDelaySeconds }}
|
||||||
periodSeconds: {{ $svc.readinessProbe.periodSeconds }}
|
periodSeconds: {{ $svc.readinessProbe.periodSeconds }}
|
||||||
timeoutSeconds: {{ $svc.readinessProbe.timeoutSeconds | default 3 }}
|
timeoutSeconds: {{ $svc.readinessProbe.timeoutSeconds | default 3 }}
|
||||||
|
|||||||
302
docs/metrics.md
Normal file
302
docs/metrics.md
Normal file
@ -0,0 +1,302 @@
|
|||||||
|
# 监控指标文档
|
||||||
|
|
||||||
|
## 概览
|
||||||
|
|
||||||
|
| 服务 | 端点 | 端口 | 说明 |
|
||||||
|
|------|------|------|------|
|
||||||
|
| app | `GET /health` | 8080 | 主站 API (含 `/metrics`) |
|
||||||
|
| static | `GET /health` | 8081 | 静态文件服务 (avatar, blob, media) |
|
||||||
|
| gitserver | `GET /health` | 8021 | Git 智能协议 (HTTP + SSH) |
|
||||||
|
| git-hook | `GET /health` | 8083 | Hook worker (含 `/metrics`) |
|
||||||
|
| email-worker | `GET /health` | 8084 | 邮件队列 worker (含 `/metrics`) |
|
||||||
|
|
||||||
|
## 健康检查返回格式
|
||||||
|
|
||||||
|
所有服务的 `/health` 返回格式一致:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{ "status": "ok", "db": "ok", "cache": "ok" }
|
||||||
|
```
|
||||||
|
|
||||||
|
- `db: ok` — PostgreSQL 可用
|
||||||
|
- `cache: ok` — Redis 可用
|
||||||
|
- 任一不可用时返回 HTTP `503`,`status: unhealthy`
|
||||||
|
|
||||||
|
## Prometheus 采集配置
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: 'gitdata-app'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['app:8080']
|
||||||
|
metrics_path: '/metrics'
|
||||||
|
|
||||||
|
- job_name: 'gitdata-git-hook'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['git-hook:8083']
|
||||||
|
metrics_path: '/metrics'
|
||||||
|
|
||||||
|
- job_name: 'gitdata-email-worker'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['email-worker:8084']
|
||||||
|
metrics_path: '/metrics'
|
||||||
|
```
|
||||||
|
|
||||||
|
> 注:gitserver 和 static 服务目前未暴露 `/metrics`,上表中它们的端口仅提供 `/health` 健康检查,因此不在 Prometheus 采集配置中。
|
||||||
|
|
||||||
|
## 指标列表
|
||||||
|
|
||||||
|
### git-hook 指标 (`GET :8083/metrics`)
|
||||||
|
|
||||||
|
| 指标名 | 类型 | 标签 | 说明 |
|
||||||
|
|--------|------|------|------|
|
||||||
|
| `hook_tasks_total` | Counter | `task_type` | 各类型任务总执行次数 |
|
||||||
|
| `hook_tasks_success_total` | Counter | `task_type` | 成功完成的任务数 |
|
||||||
|
| `hook_tasks_failed_total` | Counter | `task_type` | 失败的任务数 (不含重试) |
|
||||||
|
| `hook_tasks_locked_total` | Counter | — | 仓库被其他 worker 锁定,重新入队的次数 |
|
||||||
|
| `hook_tasks_retried_total` | Counter | — | 触发重试的次数 |
|
||||||
|
| `hook_tasks_exhausted_total` | Counter | — | 重试耗尽后被丢弃的任务数 |
|
||||||
|
| `hook_sync_branches_changed_total` | Counter | — | 同步时产生的分支变更总数 |
|
||||||
|
| `hook_sync_tags_changed_total` | Counter | — | 同步时产生的标签变更总数 |
|
||||||
|
|
||||||
|
**`task_type` 标签值**:
|
||||||
|
- `Sync` — 完整同步 (refs + commits + tags + LFS + fsck + gc + skills)
|
||||||
|
- `Fsck` — 仅校验仓库完整性
|
||||||
|
- `Gc` — 仅垃圾回收
|
||||||
|
|
||||||
|
**PromQL 示例**:
|
||||||
|
```promql
|
||||||
|
# 任务成功率 (Counter 累计值之比,按 task_type 分别计算)
|
||||||
|
hook_tasks_success_total / hook_tasks_total
|
||||||
|
|
||||||
|
# Sync 任务每秒失败次数 (最近 5 分钟的平均速率,非失败占比)
|
||||||
|
rate(hook_tasks_failed_total{task_type="Sync"}[5m])
|
||||||
|
|
||||||
|
# 仓库锁定频率
|
||||||
|
rate(hook_tasks_locked_total[15m])
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### email-worker 指标 (`GET :8084/metrics`)
|
||||||
|
|
||||||
|
| 指标名 | 类型 | 说明 |
|
||||||
|
|--------|------|------|
|
||||||
|
| `email_queued_total` | Counter | 写入 Redis Stream 的邮件总数 (生产端) |
|
||||||
|
| `email_consumed_total` | Counter | 从队列消费的邮件总数 |
|
||||||
|
| `email_batch_size` | Counter | 消费批次大小累计值 |
|
||||||
|
| `email_validation_skipped_total` | Counter | 收件人地址校验失败被跳过的邮件数 |
|
||||||
|
| `email_build_errors_total` | Counter | 邮件消息构建失败的次数 |
|
||||||
|
| `email_send_attempts_total` | Counter | SMTP 发送尝试总次数 (含重试) |
|
||||||
|
| `email_sent_total` | Counter | 成功发送的邮件数 |
|
||||||
|
| `email_send_failures_total` | Counter | 经 3 次重试后最终失败的邮件数 |
|
||||||
|
|
||||||
|
**PromQL 示例**:
|
||||||
|
```promql
|
||||||
|
# 邮件发送成功率
|
||||||
|
email_sent_total / (email_sent_total + email_send_failures_total)
|
||||||
|
|
||||||
|
# 校验失败率
|
||||||
|
rate(email_validation_skipped_total[5m]) / rate(email_queued_total[5m])
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### app 指标 (`GET :8080/metrics`)
|
||||||
|
|
||||||
|
由 `observability` crate 导出:
|
||||||
|
|
||||||
|
| 指标名 | 类型 | 说明 |
|
||||||
|
|--------|------|------|
|
||||||
|
| `ai_calls_total` | Counter | AI 对话调用总次数 |
|
||||||
|
| `ai_calls_success` | Counter | AI 调用成功次数 |
|
||||||
|
| `ai_calls_failure` | Counter | AI 调用失败次数 |
|
||||||
|
| `ai_input_tokens_total` | Counter | 累计输入 token 数 |
|
||||||
|
| `ai_output_tokens_total` | Counter | 累计输出 token 数 |
|
||||||
|
| `ai_function_calls_total` | Counter | AI function/tool 调用次数 |
|
||||||
|
| `http_requests_total` | Counter | HTTP 请求总数 (带 `service="app"` 标签) |
|
||||||
|
| `http_request_duration_ms_total` | Counter | HTTP 请求累计耗时 (ms) |
|
||||||
|
| `http_requests_by_status_class` | Gauge | 按状态码分类的请求数 (`2xx`, `4xx`, `5xx`) |
|
||||||
|
|
||||||
|
## Kubernetes 探针配置
|
||||||
|
|
||||||
|
> 以下为 values.yaml 中实际生效的配置值。Helm 模板使用具名端口 `health` 引用容器端口。
|
||||||
|
|
||||||
|
### app
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
|
||||||
|
startupProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8080
|
||||||
|
initialDelaySeconds: 0
|
||||||
|
periodSeconds: 10
|
||||||
|
failureThreshold: 30
|
||||||
|
```
|
||||||
|
|
||||||
|
### static
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8081
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8081
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### gitserver
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: health # 具名端口,映射到容器 8021
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: health
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 5
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### git-hook
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8083
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 3
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8083
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 10
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### email-worker
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
livenessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8084
|
||||||
|
initialDelaySeconds: 10
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 5
|
||||||
|
failureThreshold: 3
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
httpGet:
|
||||||
|
path: /health
|
||||||
|
port: 8084
|
||||||
|
initialDelaySeconds: 5
|
||||||
|
periodSeconds: 15
|
||||||
|
timeoutSeconds: 3
|
||||||
|
failureThreshold: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
## 告警规则示例
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Git Hook Worker Down
|
||||||
|
- alert: GitHookDown
|
||||||
|
expr: up{job="gitdata-git-hook"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Git Hook worker 无法访问"
|
||||||
|
|
||||||
|
# Git Hook 任务失败率过高
|
||||||
|
- alert: GitHookHighFailureRate
|
||||||
|
expr: |
|
||||||
|
rate(hook_tasks_failed_total[5m])
|
||||||
|
/ rate(hook_tasks_total[5m]) > 0.1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Git Hook 任务失败率超过 10%"
|
||||||
|
|
||||||
|
# Email Worker Down
|
||||||
|
- alert: EmailWorkerDown
|
||||||
|
expr: up{job="gitdata-email-worker"} == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Email Worker 无法访问"
|
||||||
|
|
||||||
|
# 邮件发送失败率过高
|
||||||
|
- alert: EmailHighFailureRate
|
||||||
|
expr: |
|
||||||
|
rate(email_send_failures_total[5m])
|
||||||
|
/ (rate(email_sent_total[5m]) + rate(email_send_failures_total[5m])) > 0.05
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "邮件发送失败率超过 5%"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Sitemap 服务端点
|
||||||
|
|
||||||
|
| 端点 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `GET /sitemap.xml` | sitemapindex,引用所有子 sitemap |
|
||||||
|
| `GET /sidemap/static` | 固定页面 (首页、auth、营销页) |
|
||||||
|
| `GET /sidemap/users` | 公开用户页面 (按用户名字母排序) |
|
||||||
|
| `GET /sidemap/projects` | 公开项目页面 (按项目名字母排序) |
|
||||||
|
| `GET /sidemap/repos` | 公开仓库页面 (按仓库名字母排序) |
|
||||||
|
| `GET /robots.txt` | robots.txt,声明 Sitemap 位置 |
|
||||||
|
|
||||||
|
**缓存策略**:用户/项目/仓库 sitemap 数据通过 Redis 缓存,TTL 8 小时,访问不续期,过期自动重新生成。缓存在 key `sidemap:{users,projects,repos}` 下。
|
||||||
|
|
||||||
|
**域名**:`robots.txt` 中的 Sitemap URL 动态读取 `APP_DOMAIN_URL` 环境变量并强制使用 `https://` 前缀。
|
||||||
|
|
||||||
|
**禁止爬取**:
|
||||||
|
- 主站 `app` (`/robots.txt`) — 禁止 `/api/`、`/health`、`/metrics`、`/ws/` 等,同时声明 `Sitemap: {APP_DOMAIN_URL}/sitemap.xml`
|
||||||
|
- Gitserver (`/robots.txt`) — `Disallow: /` 禁止所有路由,并指向主站 sitemap
|
||||||
|
|
||||||
|
**缓存策略**:用户/项目/仓库 sitemap 数据通过 Redis 缓存,TTL 8 小时,访问不续期,过期自动重新生成。缓存在 key `sidemap:{users,projects,repos}` 下。
|
||||||
@ -20,6 +20,25 @@ pub struct HttpAppState {
|
|||||||
pub cache: AppCache,
|
pub cache: AppCache,
|
||||||
pub sync: crate::ssh::ReceiveSyncService,
|
pub sync: crate::ssh::ReceiveSyncService,
|
||||||
pub rate_limiter: Arc<rate_limit::RateLimiter>,
|
pub rate_limiter: Arc<rate_limit::RateLimiter>,
|
||||||
|
pub config: AppConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn robots(state: web::Data<HttpAppState>) -> HttpResponse {
|
||||||
|
let sitemap_url = state
|
||||||
|
.config
|
||||||
|
.git_http_domain()
|
||||||
|
.map(|d| format!("{}/sitemap.xml", d.trim_end_matches('/')))
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let body = if sitemap_url.is_empty() {
|
||||||
|
"User-agent: *\nDisallow: /\n".to_string()
|
||||||
|
} else {
|
||||||
|
format!("User-agent: *\nDisallow: /\n\nSitemap: {sitemap_url}\n")
|
||||||
|
};
|
||||||
|
|
||||||
|
HttpResponse::Ok()
|
||||||
|
.content_type("text/plain; charset=utf-8")
|
||||||
|
.body(body)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
||||||
@ -49,7 +68,8 @@ async fn health(state: web::Data<HttpAppState>) -> HttpResponse {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn git_http_cfg(cfg: &mut web::ServiceConfig) {
|
pub fn git_http_cfg(cfg: &mut web::ServiceConfig) {
|
||||||
cfg.route("/health", web::get().to(health))
|
cfg.route("/robots.txt", web::get().to(robots))
|
||||||
|
.route("/health", web::get().to(health))
|
||||||
.route(
|
.route(
|
||||||
"/{namespace}/{repo_name}.git/info/refs",
|
"/{namespace}/{repo_name}.git/info/refs",
|
||||||
web::get().to(routes::info_refs),
|
web::get().to(routes::info_refs),
|
||||||
@ -119,6 +139,7 @@ pub async fn run_http(config: AppConfig) -> anyhow::Result<()> {
|
|||||||
cache: app_cache.clone(),
|
cache: app_cache.clone(),
|
||||||
sync,
|
sync,
|
||||||
rate_limiter,
|
rate_limiter,
|
||||||
|
config: config.clone(),
|
||||||
};
|
};
|
||||||
|
|
||||||
tracing::info!("Starting git HTTP server on 0.0.0.0:8021");
|
tracing::info!("Starting git HTTP server on 0.0.0.0:8021");
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user