From 9dc6e9749ce1c0c45451c863016d1c002fda992c Mon Sep 17 00:00:00 2001 From: Alex Mickelson Date: Mon, 23 Feb 2026 15:13:16 -0700 Subject: [PATCH] works eventually, still shaky --- .vscode/extensions.json | 3 + DISTRIBUTED_SETUP.md | 236 ------------------------------------- client/src/App.tsx | 138 +++++++++------------- docker-compose.yml | 29 +++++ nginx-lb.conf | 43 +++++++ otel-collector-config.yaml | 45 +++++++ 6 files changed, 178 insertions(+), 316 deletions(-) create mode 100644 .vscode/extensions.json delete mode 100644 DISTRIBUTED_SETUP.md create mode 100644 nginx-lb.conf create mode 100644 otel-collector-config.yaml diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..4765b2d --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,3 @@ +{ + "recommendations": ["jakebecker.elixir-ls"] +} diff --git a/DISTRIBUTED_SETUP.md b/DISTRIBUTED_SETUP.md deleted file mode 100644 index 19ae8ad..0000000 --- a/DISTRIBUTED_SETUP.md +++ /dev/null @@ -1,236 +0,0 @@ -# Distributed Phoenix WebSocket Game - -This project demonstrates a distributed Phoenix application with automatic failover using native Erlang clustering. 
- -## Architecture - -- **3 Phoenix Nodes**: Running in Docker containers, forming a distributed Erlang cluster -- **Global Process Registry**: Uses `:global` to ensure only one `GameState` GenServer runs across the cluster -- **Automatic Failover**: If a node goes down, another node automatically takes over the GameState -- **Nginx Load Balancer**: Routes WebSocket connections to healthy nodes -- **Client Failover**: Frontend automatically switches to another server if connection is lost - -## How It Works - -### Distributed Erlang Clustering - -- Each Phoenix container starts with a unique node name (e.g., `backend@phoenix1`) -- All nodes share the same Erlang cookie for authentication -- `Backend.Cluster` module automatically connects nodes on startup -- Nodes use EPMD (Erlang Port Mapper Daemon) for discovery - -### Singleton Game State - -- `Backend.GameState` is registered globally using `{:global, __MODULE__}` -- Only ONE instance runs across all nodes at any time -- If the node running GameState crashes, Erlang automatically starts it on another node -- All nodes can communicate with the GameState regardless of where it's running - -### Client Failover - -- Frontend maintains a list of all backend servers -- Automatically reconnects to the next server if connection fails -- Uses exponential backoff and retry logic -- Displays current connection status - -## Setup - -### Prerequisites - -- Docker and Docker Compose -- Or: Elixir 1.15+, Erlang 26+, Node.js 18+ - -### Running with Docker - -1. Build and start all services: - -```bash -docker-compose up --build -``` - -This starts: -- `phoenix1` on port 4001 -- `phoenix2` on port 4002 -- `phoenix3` on port 4003 -- `nginx` load balancer on port 4000 - -2. Open the client (in a separate terminal): - -```bash -cd client -pnpm install -pnpm dev -``` - -3. 
Open http://localhost:5173 in your browser - -### Running Locally (Development) - -Terminal 1 - Backend Node 1: -```bash -cd backend -mix deps.get -export RELEASE_NODE=backend@127.0.0.1 -export RELEASE_COOKIE=mycookie -export PORT=4001 -export CLUSTER_NODES="backend@127.0.0.1" -iex --name backend@127.0.0.1 --cookie mycookie -S mix phx.server -``` - -Terminal 2 - Backend Node 2: -```bash -cd backend -export RELEASE_NODE=backend@127.0.0.2 -export RELEASE_COOKIE=mycookie -export PORT=4002 -export CLUSTER_NODES="backend@127.0.0.1,backend@127.0.0.2" -iex --name backend@127.0.0.2 --cookie mycookie -S mix phx.server -``` - -Terminal 3 - Frontend: -```bash -cd client -pnpm install -pnpm dev -``` - -## Testing Failover - -### Test 1: Stop a node - -```bash -# Stop one container -docker-compose stop phoenix1 - -# The game continues running on phoenix2 or phoenix3 -# Clients automatically reconnect to available nodes -``` - -### Test 2: Kill the node running GameState - -1. Find which node is running GameState: -```bash -docker-compose exec phoenix1 /app/bin/backend remote -# In the IEx shell: -:global.whereis_name(Backend.GameState) -# This shows {pid, node_name} -``` - -2. Stop that specific node: -```bash -docker-compose stop phoenix2 # or whichever node is running it -``` - -3. The GameState automatically starts on another node -4. 
All players remain in the game - -### Test 3: Network partition - -```bash -# Disconnect a node from the network -docker network disconnect websocket-testing_app_net phoenix3 - -# Reconnect it -docker network connect websocket-testing_app_net phoenix3 -``` - -## Monitoring the Cluster - -### Check connected nodes - -```bash -docker-compose exec phoenix1 /app/bin/backend remote -``` - -In the IEx shell: -```elixir -# List all connected nodes -Node.list() - -# Check which node is running GameState -:global.whereis_name(Backend.GameState) - -# Get current game state -Backend.GameState.get_state() - -# Check registered global processes -:global.registered_names() -``` - -### View logs - -```bash -# All containers -docker-compose logs -f - -# Specific container -docker-compose logs -f phoenix1 -``` - -## Configuration - -### Environment Variables - -- `RELEASE_NODE`: Node name (e.g., `backend@phoenix1`) -- `RELEASE_COOKIE`: Erlang cookie for cluster authentication -- `CLUSTER_NODES`: Comma-separated list of nodes to connect to -- `PORT`: HTTP port for Phoenix endpoint -- `SECRET_KEY_BASE`: Phoenix secret key - -### Scaling - -To add more nodes, edit `docker-compose.yml`: - -```yaml -phoenix4: - # Same config as phoenix1-3, with unique: - # - container_name: phoenix4 - # - hostname: phoenix4 - # - RELEASE_NODE: backend@phoenix4 - # - ports: "4004:4000" - # - ipv4_address: 172.25.0.14 -``` - -Update `CLUSTER_NODES` in all services to include `backend@phoenix4`. - -## How to Play - -- Use **WASD** keys to move your player -- Your player is shown in red, others in blue -- The game state is shared across all nodes -- Try killing nodes to see failover in action! - -## Troubleshooting - -### Nodes not connecting - -1. Check all nodes have the same `RELEASE_COOKIE` -2. Verify EPMD is running: `docker-compose exec phoenix1 epmd -names` -3. Check firewall allows ports 4369 (EPMD) and 9000-9100 (distributed Erlang) - -### GameState not starting - -1. 
Check logs: `docker-compose logs -f` -2. Verify only one instance exists globally: `:global.registered_names()` -3. Restart all nodes: `docker-compose restart` - -### Frontend not connecting - -1. Check nginx is running: `docker-compose ps nginx` -2. Verify at least one Phoenix node is healthy -3. Check browser console for connection errors -4. Try connecting directly to a node: http://localhost:4001 - -## Production Considerations - -- **Change the Erlang cookie**: Use a strong secret -- **Use proper SSL/TLS**: Configure HTTPS for WebSocket connections -- **Add health checks**: Monitor node health and GameState availability -- **Persistent storage**: Add database for game state persistence -- **Rate limiting**: Protect against abuse -- **Monitoring**: Add Prometheus/Grafana for metrics -- **Logging**: Centralize logs with ELK or similar - -## License - -MIT diff --git a/client/src/App.tsx b/client/src/App.tsx index 3fb64c7..d29500a 100644 --- a/client/src/App.tsx +++ b/client/src/App.tsx @@ -11,106 +11,86 @@ interface GameState { [playerId: string]: Player; } -// List of WebSocket servers - we'll connect to all of them -const WS_SERVERS = [ - "ws://localhost:4001/socket", - "ws://localhost:4002/socket", - "ws://localhost:4003/socket", -]; +// Connect to nginx load balancer +const WS_SERVER = "ws://localhost:4000/socket"; function App() { const [players, setPlayers] = useState({}); const [myPlayerId, setMyPlayerId] = useState(null); const [connectionStatus, setConnectionStatus] = useState("connecting"); - const socketsRef = useRef([]); - const channelsRef = useRef([]); + const socketRef = useRef(null); + const channelRef = useRef(null); const keysPressed = useRef>(new Set()); useEffect(() => { - // Connect to all servers concurrently - const sockets = WS_SERVERS.map((serverUrl) => { - console.log(`Connecting to ${serverUrl}`); + // Connect to nginx load balancer + console.log(`Connecting to ${WS_SERVER}`); - const socket = new Socket(serverUrl, { - timeout: 3000, - 
reconnectAfterMs: () => 2000, // Keep trying to reconnect - }); - - // Handle connection events - socket.onOpen(() => { - console.log(`✓ Connected to ${serverUrl}`); - updateConnectionStatus(); - }); - - socket.onError((error) => { - console.error(`✗ Error on ${serverUrl}:`, error); - updateConnectionStatus(); - }); - - socket.onClose(() => { - console.log(`✗ Disconnected from ${serverUrl}`); - updateConnectionStatus(); - }); - - socket.connect(); - return socket; + const socket = new Socket(WS_SERVER, { + timeout: 3000, + reconnectAfterMs: (tries) => + [1000, 2000, 5000, 10000][tries - 1] || 10000, }); - socketsRef.current = sockets; + socket.onOpen(() => { + console.log(`✓ Connected to load balancer`); + setConnectionStatus("Connected"); + }); - // Join game channel on all connected sockets - const channels = sockets.map((socket, index) => { - const channel = socket.channel("game:lobby", {}); + socket.onError((error) => { + console.error(`✗ Connection error:`, error); + setConnectionStatus("Connection error"); + }); - channel - .join() - .receive("ok", () => { - console.log(`✓ Joined channel on ${WS_SERVERS[index]}`); - updateConnectionStatus(); - }) - .receive("error", (resp) => { - console.log(`✗ Failed to join on ${WS_SERVERS[index]}:`, resp); - }) - .receive("timeout", () => { - console.log(`✗ Timeout joining on ${WS_SERVERS[index]}`); - }); + socket.onClose(() => { + console.log(`✗ Disconnected from load balancer`); + setConnectionStatus("Disconnected - reconnecting..."); + }); - // Listen for game state updates from any server - channel.on("game_state", (payload: { players: GameState }) => { - setPlayers(payload.players); + socket.connect(); + socketRef.current = socket; - // Set our player ID from the first state update if not set - if (!myPlayerId && Object.keys(payload.players).length > 0) { - const playerIds = Object.keys(payload.players); - if (playerIds.length > 0) { - setMyPlayerId(playerIds[playerIds.length - 1]); - } + // Join game channel + const 
channel = socket.channel("game:lobby", {}); + + channel + .join() + .receive("ok", () => { + console.log(`✓ Joined game channel`); + setConnectionStatus("Connected & playing"); + }) + .receive("error", (resp) => { + console.log(`✗ Failed to join:`, resp); + setConnectionStatus("Failed to join game"); + }) + .receive("timeout", () => { + console.log(`✗ Timeout joining`); + setConnectionStatus("Connection timeout"); + }); + + // Listen for game state updates + channel.on("game_state", (payload: { players: GameState }) => { + setPlayers(payload.players); + + if (!myPlayerId && Object.keys(payload.players).length > 0) { + const playerIds = Object.keys(payload.players); + if (playerIds.length > 0) { + setMyPlayerId(playerIds[playerIds.length - 1]); } - }); - - return channel; + } }); - channelsRef.current = channels; - - const updateConnectionStatus = () => { - const joined = channels.filter((c) => c.state === "joined").length; - setConnectionStatus(`${joined}/${WS_SERVERS.length} servers active`); - }; - - // Periodic status update - const statusInterval = setInterval(updateConnectionStatus, 1000); + channelRef.current = channel; // Cleanup on unmount return () => { - clearInterval(statusInterval); - channels.forEach((channel) => channel.leave()); - sockets.forEach((socket) => socket.disconnect()); + channel.leave(); + socket.disconnect(); }; }, [myPlayerId]); - // Handle keyboard input - send to first available channel + // Handle keyboard input - send to active channel useEffect(() => { const handleKeyDown = (e: KeyboardEvent) => { const key = e.key.toLowerCase(); @@ -121,11 +101,9 @@ function App() { if (!keysPressed.current.has(key)) { keysPressed.current.add(key); - // Send to first joined channel (they all share same game state) - const activeChannel = channelsRef.current.find( - (c) => c.state === "joined", - ); - if (activeChannel) { + // Send to active channel + const activeChannel = channelRef.current; + if (activeChannel && activeChannel.state === "joined") 
{ activeChannel.push("move", { direction: key }); } } diff --git a/docker-compose.yml b/docker-compose.yml index a773e52..0b4061f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,6 +17,7 @@ services: - DATABASE_URL=ecto://postgres:postgres@db/backend_dev - SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv - CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3 + - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 ports: - "4001:4000" healthcheck: @@ -41,6 +42,7 @@ services: - DATABASE_URL=ecto://postgres:postgres@db/backend_dev - SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv - CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3 + - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 ports: - "4002:4000" healthcheck: @@ -65,6 +67,7 @@ services: - DATABASE_URL=ecto://postgres:postgres@db/backend_dev - SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv - CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3 + - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 ports: - "4003:4000" healthcheck: @@ -73,6 +76,18 @@ services: timeout: 5s retries: 5 + nginx-lb: + image: nginx:alpine + container_name: nginx-lb + volumes: + - ./nginx-lb.conf:/etc/nginx/conf.d/default.conf + ports: + - "4000:80" + depends_on: + - phoenix1 + - phoenix2 + - phoenix3 + client: build: context: ./client @@ -80,3 +95,17 @@ services: container_name: client ports: - "5173:80" + depends_on: + - nginx-lb + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + container_name: otel-collector + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + ports: + - "4318:4318" # OTLP HTTP receiver + - "4317:4317" # OTLP gRPC receiver + - "8888:8888" # Prometheus metrics + - "8889:8889" # Prometheus exporter metrics diff --git a/nginx-lb.conf b/nginx-lb.conf new 
file mode 100644 index 0000000..83879ea --- /dev/null +++ b/nginx-lb.conf @@ -0,0 +1,43 @@ +upstream phoenix_backend { + # Hash based on WebSocket handshake key for sticky sessions + # Note: Each new connection gets a new key, so reconnections may route differently + hash $http_sec_websocket_key consistent; + + # Failover configuration: mark a server as down after 1 failed attempt (max_fails=1) + # The server is skipped for 30s (fail_timeout) and then retried + server phoenix1:4000 max_fails=1 fail_timeout=30s; + server phoenix2:4000 max_fails=1 fail_timeout=30s; + server phoenix3:4000 max_fails=1 fail_timeout=30s; +} + +server { + listen 80; + server_name localhost; + + location /socket { + proxy_pass http://phoenix_backend; + proxy_http_version 1.1; + + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /api { + proxy_pass http://phoenix_backend; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /health { + access_log off; + return 200 "healthy\n"; + add_header Content-Type text/plain; + } +} diff --git a/otel-collector-config.yaml b/otel-collector-config.yaml new file mode 100644 index 0000000..7defe32 --- /dev/null +++ b/otel-collector-config.yaml @@ -0,0 +1,45 @@ +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + +exporters: + # Log to console for debugging (debug exporter replaces the removed logging exporter) + debug: + verbosity: normal + + # Export to Prometheus + prometheus: + endpoint: 0.0.0.0:8889 + namespace: websocket_game + + # Uncomment to export traces to Jaeger over OTLP (the dedicated jaeger exporter was removed) + # otlp/jaeger: + # endpoint: jaeger:4317 + # tls: + # insecure: 
true + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] + + metrics: + receivers: [otlp] + processors: [batch] + exporters: [debug, prometheus] + + logs: + receivers: [otlp] + processors: [batch] + exporters: [debug]