feat(room): improve robustness — optimistic send, atomic seq, jitter reconnect

Backend:
- Atomic seq assignment via Redis Lua script: INCR + GET run atomically
  inside a Lua script, preventing duplicate seqs under concurrent requests.
  DB reconciliation only triggers on cross-server handoff (rare path).
- Broadcast channel capacity: 10,000 → 100,000 to prevent message drops
  under high-throughput rooms.

Frontend:
- Optimistic sendMessage: adds message to UI immediately (marked
  isOptimistic=true) so user sees it instantly. Replaces with
  server-confirmed message on success, marks as isOptimisticError on
  failure. Fire-and-forget to IndexedDB for persistence.
- Seq-based dedup in onRoomMessage: replaces optimistic message by
  matching seq, preventing duplicates when WS arrives before REST confirm.
- Reconnect jitter: replaced deterministic backoff with full jitter
  (random within backoff window), preventing thundering herd on server
  restart.
- Visual WS status dot in room header: green=connected, amber
  (pulsing)=connecting, red=error/disconnected.
- isPending check extended to cover both old 'temp-' prefix and new
  isOptimistic flag, showing 'Sending...' / 'Failed' badges.
This commit is contained in:
ZhenYi 2026-04-16 19:23:06 +08:00
parent 5482283727
commit c89f01b718
7 changed files with 132 additions and 21 deletions

View File

@ -379,10 +379,12 @@ async fn poll_push_streams(streams: &mut PushStreams) -> Option<WsPushEvent> {
}
return Some(WsPushEvent::RoomMessage { room_id, event });
}
Some(Err(_)) => {
streams.remove(&room_id);
}
None => {
Some(Err(_)) | None => {
// Stream closed/error — remove and re-subscribe to avoid
// spinning on a closed stream. The manager keeps the
// broadcast sender alive so re-subscribing gets the latest
// receiver. Multiple rapid errors are handled by the
// manager's existing retry/cleanup logic.
streams.remove(&room_id);
}
}

View File

@ -16,7 +16,7 @@ use crate::error::RoomError;
use crate::metrics::RoomMetrics;
use crate::types::NotificationEvent;
const BROADCAST_CAPACITY: usize = 10000;
const BROADCAST_CAPACITY: usize = 100_000;
const SHUTDOWN_CHANNEL_CAPACITY: usize = 16;
const CONNECTION_COOLDOWN: Duration = Duration::from_secs(30);
const MAX_CONNECTIONS_PER_ROOM: usize = 50000;

View File

@ -1121,25 +1121,46 @@ impl RoomService {
let mut conn = cache.conn().await.map_err(|e| {
RoomError::Internal(format!("failed to get redis connection for seq: {}", e))
})?;
let seq: i64 = redis::cmd("INCR")
// Atomically increment and check via Lua: INCR first, then if Redis was
// externally set to a higher value, jump to max+1. This prevents concurrent
// requests from getting duplicate seqs — the Lua script runs as one atomic unit.
let seq: i64 = redis::cmd("EVAL")
.arg(
r#"
local current = redis.call('INCR', KEYS[1])
local stored = redis.call('GET', KEYS[1])
if stored and tonumber(stored) > current then
local next = tonumber(stored) + 1
redis.call('SET', KEYS[1], next)
return next
end
return current
"#,
)
.arg(1)
.arg(&seq_key)
.query_async(&mut conn)
.await
.map_err(|e| RoomError::Internal(format!("INCR seq: {}", e)))?;
.map_err(|e| RoomError::Internal(format!("seq Lua script: {}", e)))?;
// Reconciliation check: if DB is ahead of Redis (e.g. server restart wiped
// Redis), bump Redis to stay in sync. This query is only hit on the rare
// cross-server handoff case, not on every request.
use models::rooms::room_message::{Column as RmCol, Entity as RoomMessage};
use sea_orm::EntityTrait;
let db_seq: Option<Option<i64>> = RoomMessage::find()
let db_seq: Option<Option<Option<i64>>> = RoomMessage::find()
.filter(RmCol::Room.eq(room_id))
.select_only()
.column_as(RmCol::Seq.max(), "max_seq")
.into_tuple::<Option<i64>>()
.into_tuple::<Option<Option<i64>>>()
.one(db)
.await?
.map(|r| r);
let db_seq = db_seq.flatten().unwrap_or(0);
let db_seq = db_seq.flatten().flatten().unwrap_or(0);
if db_seq >= seq {
let _: i64 = redis::cmd("SET")
// Another server handled this room while we were idle — catch up.
let _: String = redis::cmd("SET")
.arg(&seq_key)
.arg(db_seq + 1)
.query_async(&mut conn)

View File

@ -277,6 +277,13 @@ export function RoomChatPanel({ room, isAdmin, onClose, onDelete }: RoomChatPane
? (wsError ?? 'Connection error')
: null;
// Visual connection status dot (Discord-style)
const statusDotColor = wsStatus === 'open'
? 'bg-green-500'
: wsStatus === 'connecting'
? 'bg-yellow-400 animate-pulse'
: 'bg-red-500';
const handleSend = useCallback(
(content: string) => {
sendMessage(content, 'text', replyingTo?.id ?? undefined);
@ -380,6 +387,10 @@ export function RoomChatPanel({ room, isAdmin, onClose, onDelete }: RoomChatPane
<div className="flex items-center gap-2">
<Hash className="h-5 w-5 text-muted-foreground" />
<h1 className="truncate text-base font-semibold text-foreground">{room.room_name}</h1>
<span
className={cn('h-2 w-2 rounded-full', statusDotColor)}
title={connectionLabel ?? 'Connected'}
/>
{!room.public && (
<span className="rounded bg-muted px-1.5 py-0.5 text-[11px] text-muted-foreground">
Private

View File

@ -98,7 +98,9 @@ export const RoomMessageBubble = memo(function RoomMessageBubble({
const isOwner = user?.uid === getSenderUserUid(message);
const isRevoked = !!message.revoked;
const isFailed = message.isOptimisticError === true;
const isPending = message.id.startsWith('temp-');
// True for messages that haven't been confirmed by the server yet.
// Handles both the old 'temp-' prefix and the new isOptimistic flag.
const isPending = message.isOptimistic === true || message.id.startsWith('temp-') || message.id.startsWith('optimistic-');
// Get streaming content if available
const displayContent = isStreaming && streamingMessages?.has(message.id)

View File

@ -60,6 +60,8 @@ export type MessageWithMeta = RoomMessageResponse & {
display_content?: string;
is_streaming?: boolean;
isOptimisticError?: boolean;
/** True for messages sent by the current user that haven't been confirmed by the server */
isOptimistic?: boolean;
reactions?: ReactionGroup[];
};
@ -393,9 +395,27 @@ export function RoomProvider({
// Use ref to get current activeRoomId to avoid stale closure
if (payload.room_id === activeRoomIdRef.current) {
setMessages((prev) => {
// Deduplicate by both ID (for normal) and seq (for optimistic replacement)
if (prev.some((m) => m.id === payload.id)) {
return prev;
}
// Also check if there's an optimistic message with the same seq that should be replaced
const optimisticIdx = prev.findIndex(
(m) => m.isOptimistic && m.seq === payload.seq && m.seq !== 0,
);
if (optimisticIdx !== -1) {
// Replace optimistic message with confirmed one
const confirmed: MessageWithMeta = {
...wsMessageToUiMessage(payload),
reactions: prev[optimisticIdx].reactions,
};
const next = [...prev];
next[optimisticIdx] = confirmed;
// Remove optimistic from IDB, save confirmed
deleteMessageFromIdb(prev[optimisticIdx].id).catch(() => {});
saveMessage(confirmed).catch(() => {});
return next;
}
const newMsg = wsMessageToUiMessage(payload);
let updated = [...prev, newMsg];
updated.sort((a, b) => a.seq - b.seq);
@ -768,12 +788,64 @@ export function RoomProvider({
async (content: string, contentType = 'text', inReplyTo?: string) => {
const client = wsClientRef.current;
if (!activeRoomId || !client) return;
await client.messageCreate(activeRoomId, content, {
contentType,
inReplyTo,
});
// Optimistic update: add message immediately so user sees it instantly
const optimisticId = `optimistic-${crypto.randomUUID()}`;
const optimisticMsg: MessageWithMeta = {
id: optimisticId,
room: activeRoomId,
seq: 0,
sender_type: 'member',
sender_id: user?.uid ?? null,
content,
content_type: contentType,
send_at: new Date().toISOString(),
display_content: content,
is_streaming: false,
isOptimistic: true,
thread: inReplyTo,
thread_id: inReplyTo,
in_reply_to: inReplyTo,
reactions: [],
};
setMessages((prev) => [...prev, optimisticMsg]);
// Persist optimistic message to IndexedDB so it's not lost on refresh
saveMessage(optimisticMsg).catch(() => {});
try {
const confirmedMsg = await client.messageCreate(activeRoomId, content, {
contentType,
inReplyTo,
});
// Replace optimistic message with server-confirmed one
setMessages((prev) => {
const without = prev.filter((m) => m.id !== optimisticId);
const confirmed: MessageWithMeta = {
...confirmedMsg,
thread_id: confirmedMsg.thread,
display_content: confirmedMsg.content,
is_streaming: false,
isOptimistic: false,
reactions: [],
};
// Remove optimistic from IDB
deleteMessageFromIdb(optimisticId).catch(() => {});
// Save confirmed to IDB
saveMessage(confirmed).catch(() => {});
return [...without, confirmed];
});
} catch (err) {
// Mark optimistic message as failed
setMessages((prev) =>
prev.map((m) =>
m.id === optimisticId ? { ...m, isOptimisticError: true } : m,
),
);
handleRoomError('Send message', err);
}
},
[activeRoomId],
[activeRoomId, user],
);
const editMessage = useCallback(

View File

@ -932,10 +932,13 @@ export class RoomWsClient {
private scheduleReconnect(): void {
if (!this.shouldReconnect) return;
const delay = Math.min(
this.reconnectBaseDelay * Math.pow(2, this.reconnectAttempt),
this.reconnectMaxDelay,
);
// Exponential backoff with full jitter (uniform random within the backoff window).
// Without jitter, all disconnected clients reconnect at exactly the same time
// (thundering herd) after a server restart, overwhelming it.
const baseDelay = this.reconnectBaseDelay * Math.pow(2, this.reconnectAttempt);
const cappedDelay = Math.min(baseDelay, this.reconnectMaxDelay);
const jitter = Math.random() * cappedDelay;
const delay = Math.floor(jitter);
this.reconnectAttempt++;
this.reconnectTimer = setTimeout(() => {