works eventually, still shaky
This commit is contained in:
3
.vscode/extensions.json
vendored
Normal file
3
.vscode/extensions.json
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"recommendations": ["jakebecker.elixir-ls"]
|
||||
}
|
||||
@@ -1,236 +0,0 @@
|
||||
# Distributed Phoenix WebSocket Game
|
||||
|
||||
This project demonstrates a distributed Phoenix application with automatic failover using native Erlang clustering.
|
||||
|
||||
## Architecture
|
||||
|
||||
- **3 Phoenix Nodes**: Running in Docker containers, forming a distributed Erlang cluster
|
||||
- **Global Process Registry**: Uses `:global` to ensure only one `GameState` GenServer runs across the cluster
|
||||
- **Automatic Failover**: If a node goes down, another node automatically takes over the GameState
|
||||
- **Nginx Load Balancer**: Routes WebSocket connections to healthy nodes
|
||||
- **Client Failover**: Frontend automatically switches to another server if connection is lost
|
||||
|
||||
## How It Works
|
||||
|
||||
### Distributed Erlang Clustering
|
||||
|
||||
- Each Phoenix container starts with a unique node name (e.g., `backend@phoenix1`)
|
||||
- All nodes share the same Erlang cookie for authentication
|
||||
- `Backend.Cluster` module automatically connects nodes on startup
|
||||
- Nodes use EPMD (Erlang Port Mapper Daemon) for discovery
|
||||
|
||||
### Singleton Game State
|
||||
|
||||
- `Backend.GameState` is registered globally using `{:global, __MODULE__}`
|
||||
- Only ONE instance runs across all nodes at any time
|
||||
- If the node running GameState crashes, Erlang automatically starts it on another node
|
||||
- All nodes can communicate with the GameState regardless of where it's running
|
||||
|
||||
### Client Failover
|
||||
|
||||
- Frontend maintains a list of all backend servers
|
||||
- Automatically reconnects to the next server if connection fails
|
||||
- Uses exponential backoff and retry logic
|
||||
- Displays current connection status
|
||||
|
||||
## Setup
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Docker and Docker Compose
|
||||
- Or: Elixir 1.15+, Erlang 26+, Node.js 18+
|
||||
|
||||
### Running with Docker
|
||||
|
||||
1. Build and start all services:
|
||||
|
||||
```bash
|
||||
docker-compose up --build
|
||||
```
|
||||
|
||||
This starts:
|
||||
- `phoenix1` on port 4001
|
||||
- `phoenix2` on port 4002
|
||||
- `phoenix3` on port 4003
|
||||
- `nginx` load balancer on port 4000
|
||||
|
||||
2. Open the client (in a separate terminal):
|
||||
|
||||
```bash
|
||||
cd client
|
||||
pnpm install
|
||||
pnpm dev
|
||||
```
|
||||
|
||||
3. Open http://localhost:5173 in your browser
|
||||
|
||||
### Running Locally (Development)
|
||||
|
||||
Terminal 1 - Backend Node 1:
|
||||
```bash
|
||||
cd backend
|
||||
mix deps.get
|
||||
export RELEASE_NODE=backend@127.0.0.1
|
||||
export RELEASE_COOKIE=mycookie
|
||||
export PORT=4001
|
||||
export CLUSTER_NODES="backend@127.0.0.1"
|
||||
iex --name backend@127.0.0.1 --cookie mycookie -S mix phx.server
|
||||
```
|
||||
|
||||
Terminal 2 - Backend Node 2:
|
||||
```bash
|
||||
cd backend
|
||||
export RELEASE_NODE=backend@127.0.0.2
|
||||
export RELEASE_COOKIE=mycookie
|
||||
export PORT=4002
|
||||
export CLUSTER_NODES="backend@127.0.0.1,backend@127.0.0.2"
|
||||
iex --name backend@127.0.0.2 --cookie mycookie -S mix phx.server
|
||||
```
|
||||
|
||||
Terminal 3 - Frontend:
|
||||
```bash
|
||||
cd client
|
||||
pnpm install
|
||||
pnpm dev
|
||||
```
|
||||
|
||||
## Testing Failover
|
||||
|
||||
### Test 1: Stop a node
|
||||
|
||||
```bash
|
||||
# Stop one container
|
||||
docker-compose stop phoenix1
|
||||
|
||||
# The game continues running on phoenix2 or phoenix3
|
||||
# Clients automatically reconnect to available nodes
|
||||
```
|
||||
|
||||
### Test 2: Kill the node running GameState
|
||||
|
||||
1. Find which node is running GameState:
|
||||
```bash
|
||||
docker-compose exec phoenix1 /app/bin/backend remote
|
||||
# In the IEx shell:
|
||||
:global.whereis_name(Backend.GameState)
|
||||
# This shows {pid, node_name}
|
||||
```
|
||||
|
||||
2. Stop that specific node:
|
||||
```bash
|
||||
docker-compose stop phoenix2 # or whichever node is running it
|
||||
```
|
||||
|
||||
3. The GameState automatically starts on another node
|
||||
4. All players remain in the game
|
||||
|
||||
### Test 3: Network partition
|
||||
|
||||
```bash
|
||||
# Disconnect a node from the network
|
||||
docker network disconnect websocket-testing_app_net phoenix3
|
||||
|
||||
# Reconnect it
|
||||
docker network connect websocket-testing_app_net phoenix3
|
||||
```
|
||||
|
||||
## Monitoring the Cluster
|
||||
|
||||
### Check connected nodes
|
||||
|
||||
```bash
|
||||
docker-compose exec phoenix1 /app/bin/backend remote
|
||||
```
|
||||
|
||||
In the IEx shell:
|
||||
```elixir
|
||||
# List all connected nodes
|
||||
Node.list()
|
||||
|
||||
# Check which node is running GameState
|
||||
:global.whereis_name(Backend.GameState)
|
||||
|
||||
# Get current game state
|
||||
Backend.GameState.get_state()
|
||||
|
||||
# Check registered global processes
|
||||
:global.registered_names()
|
||||
```
|
||||
|
||||
### View logs
|
||||
|
||||
```bash
|
||||
# All containers
|
||||
docker-compose logs -f
|
||||
|
||||
# Specific container
|
||||
docker-compose logs -f phoenix1
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
- `RELEASE_NODE`: Node name (e.g., `backend@phoenix1`)
|
||||
- `RELEASE_COOKIE`: Erlang cookie for cluster authentication
|
||||
- `CLUSTER_NODES`: Comma-separated list of nodes to connect to
|
||||
- `PORT`: HTTP port for Phoenix endpoint
|
||||
- `SECRET_KEY_BASE`: Phoenix secret key
|
||||
|
||||
### Scaling
|
||||
|
||||
To add more nodes, edit `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
phoenix4:
|
||||
# Same config as phoenix1-3, with unique:
|
||||
# - container_name: phoenix4
|
||||
# - hostname: phoenix4
|
||||
# - RELEASE_NODE: backend@phoenix4
|
||||
# - ports: "4004:4000"
|
||||
# - ipv4_address: 172.25.0.14
|
||||
```
|
||||
|
||||
Update `CLUSTER_NODES` in all services to include `backend@phoenix4`.
|
||||
|
||||
## How to Play
|
||||
|
||||
- Use **WASD** keys to move your player
|
||||
- Your player is shown in red, others in blue
|
||||
- The game state is shared across all nodes
|
||||
- Try killing nodes to see failover in action!
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Nodes not connecting
|
||||
|
||||
1. Check all nodes have the same `RELEASE_COOKIE`
|
||||
2. Verify EPMD is running: `docker-compose exec phoenix1 epmd -names`
|
||||
3. Check firewall allows ports 4369 (EPMD) and 9000-9100 (distributed Erlang)
|
||||
|
||||
### GameState not starting
|
||||
|
||||
1. Check logs: `docker-compose logs -f`
|
||||
2. Verify only one instance exists globally: `:global.registered_names()`
|
||||
3. Restart all nodes: `docker-compose restart`
|
||||
|
||||
### Frontend not connecting
|
||||
|
||||
1. Check nginx is running: `docker-compose ps nginx`
|
||||
2. Verify at least one Phoenix node is healthy
|
||||
3. Check browser console for connection errors
|
||||
4. Try connecting directly to a node: http://localhost:4001
|
||||
|
||||
## Production Considerations
|
||||
|
||||
- **Change the Erlang cookie**: Use a strong secret
|
||||
- **Use proper SSL/TLS**: Configure HTTPS for WebSocket connections
|
||||
- **Add health checks**: Monitor node health and GameState availability
|
||||
- **Persistent storage**: Add database for game state persistence
|
||||
- **Rate limiting**: Protect against abuse
|
||||
- **Monitoring**: Add Prometheus/Grafana for metrics
|
||||
- **Logging**: Centralize logs with ELK or similar
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
@@ -11,76 +11,68 @@ interface GameState {
|
||||
[playerId: string]: Player;
|
||||
}
|
||||
|
||||
// List of WebSocket servers - we'll connect to all of them
|
||||
const WS_SERVERS = [
|
||||
"ws://localhost:4001/socket",
|
||||
"ws://localhost:4002/socket",
|
||||
"ws://localhost:4003/socket",
|
||||
];
|
||||
// Connect to nginx load balancer
|
||||
const WS_SERVER = "ws://localhost:4000/socket";
|
||||
|
||||
function App() {
|
||||
const [players, setPlayers] = useState<GameState>({});
|
||||
const [myPlayerId, setMyPlayerId] = useState<string | null>(null);
|
||||
const [connectionStatus, setConnectionStatus] =
|
||||
useState<string>("connecting");
|
||||
const socketsRef = useRef<Socket[]>([]);
|
||||
const channelsRef = useRef<Channel[]>([]);
|
||||
const socketRef = useRef<Socket | null>(null);
|
||||
const channelRef = useRef<Channel | null>(null);
|
||||
const keysPressed = useRef<Set<string>>(new Set());
|
||||
|
||||
useEffect(() => {
|
||||
// Connect to all servers concurrently
|
||||
const sockets = WS_SERVERS.map((serverUrl) => {
|
||||
console.log(`Connecting to ${serverUrl}`);
|
||||
// Connect to nginx load balancer
|
||||
console.log(`Connecting to ${WS_SERVER}`);
|
||||
|
||||
const socket = new Socket(serverUrl, {
|
||||
const socket = new Socket(WS_SERVER, {
|
||||
timeout: 3000,
|
||||
reconnectAfterMs: () => 2000, // Keep trying to reconnect
|
||||
reconnectAfterMs: (tries) =>
|
||||
[1000, 2000, 5000, 10000][tries - 1] || 10000,
|
||||
});
|
||||
|
||||
// Handle connection events
|
||||
socket.onOpen(() => {
|
||||
console.log(`✓ Connected to ${serverUrl}`);
|
||||
updateConnectionStatus();
|
||||
console.log(`✓ Connected to load balancer`);
|
||||
setConnectionStatus("Connected");
|
||||
});
|
||||
|
||||
socket.onError((error) => {
|
||||
console.error(`✗ Error on ${serverUrl}:`, error);
|
||||
updateConnectionStatus();
|
||||
console.error(`✗ Connection error:`, error);
|
||||
setConnectionStatus("Connection error");
|
||||
});
|
||||
|
||||
socket.onClose(() => {
|
||||
console.log(`✗ Disconnected from ${serverUrl}`);
|
||||
updateConnectionStatus();
|
||||
console.log(`✗ Disconnected from load balancer`);
|
||||
setConnectionStatus("Disconnected - reconnecting...");
|
||||
});
|
||||
|
||||
socket.connect();
|
||||
return socket;
|
||||
});
|
||||
socketRef.current = socket;
|
||||
|
||||
socketsRef.current = sockets;
|
||||
|
||||
// Join game channel on all connected sockets
|
||||
const channels = sockets.map((socket, index) => {
|
||||
// Join game channel
|
||||
const channel = socket.channel("game:lobby", {});
|
||||
|
||||
channel
|
||||
.join()
|
||||
.receive("ok", () => {
|
||||
console.log(`✓ Joined channel on ${WS_SERVERS[index]}`);
|
||||
updateConnectionStatus();
|
||||
console.log(`✓ Joined game channel`);
|
||||
setConnectionStatus("Connected & playing");
|
||||
})
|
||||
.receive("error", (resp) => {
|
||||
console.log(`✗ Failed to join on ${WS_SERVERS[index]}:`, resp);
|
||||
console.log(`✗ Failed to join:`, resp);
|
||||
setConnectionStatus("Failed to join game");
|
||||
})
|
||||
.receive("timeout", () => {
|
||||
console.log(`✗ Timeout joining on ${WS_SERVERS[index]}`);
|
||||
console.log(`✗ Timeout joining`);
|
||||
setConnectionStatus("Connection timeout");
|
||||
});
|
||||
|
||||
// Listen for game state updates from any server
|
||||
// Listen for game state updates
|
||||
channel.on("game_state", (payload: { players: GameState }) => {
|
||||
setPlayers(payload.players);
|
||||
|
||||
// Set our player ID from the first state update if not set
|
||||
if (!myPlayerId && Object.keys(payload.players).length > 0) {
|
||||
const playerIds = Object.keys(payload.players);
|
||||
if (playerIds.length > 0) {
|
||||
@@ -89,28 +81,16 @@ function App() {
|
||||
}
|
||||
});
|
||||
|
||||
return channel;
|
||||
});
|
||||
|
||||
channelsRef.current = channels;
|
||||
|
||||
const updateConnectionStatus = () => {
|
||||
const joined = channels.filter((c) => c.state === "joined").length;
|
||||
setConnectionStatus(`${joined}/${WS_SERVERS.length} servers active`);
|
||||
};
|
||||
|
||||
// Periodic status update
|
||||
const statusInterval = setInterval(updateConnectionStatus, 1000);
|
||||
channelRef.current = channel;
|
||||
|
||||
// Cleanup on unmount
|
||||
return () => {
|
||||
clearInterval(statusInterval);
|
||||
channels.forEach((channel) => channel.leave());
|
||||
sockets.forEach((socket) => socket.disconnect());
|
||||
channel.leave();
|
||||
socket.disconnect();
|
||||
};
|
||||
}, [myPlayerId]);
|
||||
|
||||
// Handle keyboard input - send to first available channel
|
||||
// Handle keyboard input - send to active channel
|
||||
useEffect(() => {
|
||||
const handleKeyDown = (e: KeyboardEvent) => {
|
||||
const key = e.key.toLowerCase();
|
||||
@@ -121,11 +101,9 @@ function App() {
|
||||
if (!keysPressed.current.has(key)) {
|
||||
keysPressed.current.add(key);
|
||||
|
||||
// Send to first joined channel (they all share same game state)
|
||||
const activeChannel = channelsRef.current.find(
|
||||
(c) => c.state === "joined",
|
||||
);
|
||||
if (activeChannel) {
|
||||
// Send to active channel
|
||||
const activeChannel = channelRef.current;
|
||||
if (activeChannel && activeChannel.state === "joined") {
|
||||
activeChannel.push("move", { direction: key });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ services:
|
||||
- DATABASE_URL=ecto://postgres:postgres@db/backend_dev
|
||||
- SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv
|
||||
- CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318
|
||||
ports:
|
||||
- "4001:4000"
|
||||
healthcheck:
|
||||
@@ -41,6 +42,7 @@ services:
|
||||
- DATABASE_URL=ecto://postgres:postgres@db/backend_dev
|
||||
- SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv
|
||||
- CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318
|
||||
ports:
|
||||
- "4002:4000"
|
||||
healthcheck:
|
||||
@@ -65,6 +67,7 @@ services:
|
||||
- DATABASE_URL=ecto://postgres:postgres@db/backend_dev
|
||||
- SECRET_KEY_BASE=W8nGKNhNR8vKj6A4VnwN5h5h7RZvkKmZPqxqzLzYxXGQqC6HnKp2Wm8MNqKpQxZv
|
||||
- CLUSTER_NODES=backend@phoenix1,backend@phoenix2,backend@phoenix3
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318
|
||||
ports:
|
||||
- "4003:4000"
|
||||
healthcheck:
|
||||
@@ -73,6 +76,18 @@ services:
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
nginx-lb:
|
||||
image: nginx:alpine
|
||||
container_name: nginx-lb
|
||||
volumes:
|
||||
- ./nginx-lb.conf:/etc/nginx/conf.d/default.conf
|
||||
ports:
|
||||
- "4000:80"
|
||||
depends_on:
|
||||
- phoenix1
|
||||
- phoenix2
|
||||
- phoenix3
|
||||
|
||||
client:
|
||||
build:
|
||||
context: ./client
|
||||
@@ -80,3 +95,17 @@ services:
|
||||
container_name: client
|
||||
ports:
|
||||
- "5173:80"
|
||||
depends_on:
|
||||
- nginx-lb
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:latest
|
||||
container_name: otel-collector
|
||||
command: ["--config=/etc/otel-collector-config.yaml"]
|
||||
volumes:
|
||||
- ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
|
||||
ports:
|
||||
- "4318:4318" # OTLP HTTP receiver
|
||||
- "4317:4317" # OTLP gRPC receiver
|
||||
- "8888:8888" # Prometheus metrics
|
||||
- "8889:8889" # Prometheus exporter metrics
|
||||
|
||||
43
nginx-lb.conf
Normal file
43
nginx-lb.conf
Normal file
@@ -0,0 +1,43 @@
|
||||
upstream phoenix_backend {
|
||||
# Hash based on WebSocket handshake key for sticky sessions
|
||||
# Note: Each new connection gets a new key, so reconnections may route differently
|
||||
hash $http_sec_websocket_key consistent;
|
||||
|
||||
# Failover configuration: mark server as down after 1 failed attempt within 30s
|
||||
# Server will be retried after 30s
|
||||
server phoenix1:4000 max_fails=1 fail_timeout=30s;
|
||||
server phoenix2:4000 max_fails=1 fail_timeout=30s;
|
||||
server phoenix3:4000 max_fails=1 fail_timeout=30s;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name localhost;
|
||||
|
||||
location /socket {
|
||||
proxy_pass http://phoenix_backend;
|
||||
proxy_http_version 1.1;
|
||||
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
location /api {
|
||||
proxy_pass http://phoenix_backend;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
location /health {
|
||||
access_log off;
|
||||
return 200 "healthy\n";
|
||||
add_header Content-Type text/plain;
|
||||
}
|
||||
}
|
||||
45
otel-collector-config.yaml
Normal file
45
otel-collector-config.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
|
||||
exporters:
|
||||
# Log to console for debugging
|
||||
logging:
|
||||
loglevel: info
|
||||
|
||||
# Export to Prometheus
|
||||
prometheus:
|
||||
endpoint: 0.0.0.0:8889
|
||||
namespace: websocket_game
|
||||
|
||||
# Uncomment to export to Jaeger for traces
|
||||
# jaeger:
|
||||
# endpoint: jaeger:14250
|
||||
# tls:
|
||||
# insecure: true
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [logging]
|
||||
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [logging, prometheus]
|
||||
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [logging]
|
||||
Reference in New Issue
Block a user