Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ COPY deploy/docker/* ${APP_HOME}/
# copy the playground + any future static assets
COPY deploy/docker/static ${APP_HOME}/static

# Make entrypoint executable
RUN chmod +x ${APP_HOME}/entrypoint.sh

# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}

Expand All @@ -191,15 +194,17 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
exit 1; \
fi && \
redis-cli ping > /dev/null && \
if [ "${CRAWL4AI_DISABLE_EMBEDDED_REDIS}" != "true" ]; then \
redis-cli ping > /dev/null || exit 1; \
fi && \
curl -f http://localhost:11235/health || exit 1'

EXPOSE 6379
# Switch to the non-root user before starting the application
USER appuser

# Set environment variables to ptoduction
ENV PYTHON_ENV=production
# Set environment variables to production
ENV PYTHON_ENV=production

# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]
# Start the application using entrypoint (handles conditional Redis)
ENTRYPOINT ["/app/entrypoint.sh"]
25 changes: 22 additions & 3 deletions deploy/docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -881,8 +881,13 @@ llm:
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
# api_key: sk-... # If you pass the API key directly (not recommended)

# Redis Configuration (Used by internal Redis server managed by supervisord)
# Redis Configuration
# By default, an embedded Redis server runs inside the container.
# To use an external Redis instead, set these environment variables:
# REDIS_URL=redis://:password@hostname:6379/0
# CRAWL4AI_DISABLE_EMBEDDED_REDIS=true
redis:
# uri: "redis://localhost:6379/0" # Override with full URI (or use REDIS_URL env var)
host: "localhost"
port: 6379
db: 0
Expand Down Expand Up @@ -990,12 +995,26 @@ You can override the default `config.yml`.
- Set timeouts according to your content size and network conditions
- Use Redis for rate limiting in multi-container setups

3. **Monitoring** 📊
3. **External Redis** 🔗
- By default, Crawl4AI runs an embedded Redis server inside the container
- For production deployments, you may want to use an external Redis for:
- Better memory management (embedded Redis can grow unboundedly)
- Shared state across multiple Crawl4AI containers
- Easier Redis version management and security patching
- To use external Redis:
```bash
docker run -d -p 11235:11235 \
-e REDIS_URL=redis://:password@your-redis-host:6379/0 \
-e CRAWL4AI_DISABLE_EMBEDDED_REDIS=true \
unclecode/crawl4ai:latest
```

4. **Monitoring** 📊
- Enable Prometheus if you need metrics
- Set DEBUG logging in development, INFO in production
- Regular health check monitoring is crucial

4. **Performance Tuning** ⚡
5. **Performance Tuning** ⚡
- Start with conservative rate limiter delays
- Increase batch_process timeout for large content
- Adjust stream_init timeout based on initial response times
Expand Down
9 changes: 5 additions & 4 deletions deploy/docker/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ llm:
# api_key: sk-... # If you pass the API key directly (not recommended)

# Redis Configuration
# To use external Redis instead of embedded, set REDIS_URL environment variable:
# REDIS_URL=redis://:password@hostname:6379/0
# When using external Redis, also set CRAWL4AI_DISABLE_EMBEDDED_REDIS=true
# to prevent the embedded Redis from starting.
redis:
# uri: "redis://localhost:6379/0" # Override with full URI (or use REDIS_URL env var)
host: "localhost"
port: 6379
db: 0
Expand All @@ -24,10 +29,6 @@ redis:
ssl_ca_certs: None
ssl_certfile: None
ssl_keyfile: None
ssl_cert_reqs: None
ssl_ca_certs: None
ssl_certfile: None
ssl_keyfile: None

# Rate Limiting Configuration
rate_limiting:
Expand Down
36 changes: 36 additions & 0 deletions deploy/docker/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
# Crawl4AI Docker entrypoint.
#
# Starts supervisord. The CRAWL4AI_DISABLE_EMBEDDED_REDIS environment variable
# selects which configuration it is started with:
#   - exactly "true": a gunicorn-only config is written to /tmp and used
#   - anything else : the stock supervisord.conf is used unchanged

# Abort on the first failing command.
set -e

# Default path first: any value other than the literal "true" (including
# unset) keeps the original supervisord.conf. exec replaces this shell, so
# nothing below runs in that case.
if [ "${CRAWL4AI_DISABLE_EMBEDDED_REDIS}" != "true" ]; then
    exec supervisord -c supervisord.conf
fi

echo "External Redis mode: Disabling embedded Redis server"

# Write a supervisord config that declares only the gunicorn program.
# The heredoc delimiter is quoted, so the text is written verbatim with no
# shell expansion.
cat > /tmp/supervisord.conf << 'EOF'
[supervisord]
nodaemon=true
logfile=/dev/null
logfile_maxbytes=0

[program:gunicorn]
command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app
user=appuser
autorestart=true
priority=20
environment=PYTHONUNBUFFERED=1
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
EOF

exec supervisord -c /tmp/supervisord.conf
8 changes: 7 additions & 1 deletion deploy/docker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ def load_config() -> Dict:
if llm_api_key and "api_key" not in config["llm"]:
config["llm"]["api_key"] = llm_api_key
logging.info("LLM API key loaded from LLM_API_KEY environment variable")


# Override Redis URI from environment if set (for external Redis support)
redis_url = os.environ.get("REDIS_URL")
if redis_url:
config["redis"]["uri"] = redis_url
logging.info("Redis URI overridden from REDIS_URL environment variable")

return config

def setup_logging(config: Dict) -> None:
Expand Down
175 changes: 175 additions & 0 deletions tests/docker/test_external_redis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""
Tests for external Redis configuration support.

Tests the ability to configure Crawl4AI to use an external Redis server
instead of the embedded Redis via the REDIS_URL environment variable
and CRAWL4AI_DISABLE_EMBEDDED_REDIS flag.
"""

import os
import pytest
import tempfile
import subprocess
from pathlib import Path
from unittest.mock import patch


class TestRedisUrlEnvironmentVariable:
    """Tests for REDIS_URL environment variable handling in load_config()."""

    @staticmethod
    def _load_docker_config():
        """Freshly import ``load_config`` from deploy/docker and return its result.

        The deploy/docker directory is placed at the front of ``sys.path`` only
        for the duration of the call. Any ``utils`` module that was already
        cached is set aside first and put back afterwards, so the import here
        is always fresh and other tests keep whatever ``utils`` they had.

        Returns:
            The configuration mapping produced by ``load_config()``.
        """
        import sys

        deploy_docker_path = str(
            Path(__file__).parent.parent.parent / "deploy" / "docker"
        )
        # The module is imported by its bare name below, so "utils" (not
        # "deploy.docker.utils") is the sys.modules key that must be evicted
        # to force a fresh import.
        previous_utils = sys.modules.pop("utils", None)
        sys.path.insert(0, deploy_docker_path)
        try:
            from utils import load_config

            return load_config()
        finally:
            sys.path.remove(deploy_docker_path)
            # Drop the module imported above and restore the earlier one.
            sys.modules.pop("utils", None)
            if previous_utils is not None:
                sys.modules["utils"] = previous_utils

    def test_load_config_without_redis_url(self):
        """Default config should not have redis.uri set."""
        # Same environment as the caller, minus REDIS_URL.
        env = {
            name: value
            for name, value in os.environ.items()
            if name != "REDIS_URL"
        }

        with patch.dict(os.environ, env, clear=True):
            config = self._load_docker_config()

        # uri should not be set by default (missing key or explicit None)
        assert config["redis"].get("uri") is None
        # default host/port should be the local embedded Redis
        assert config["redis"]["host"] == "localhost"
        assert config["redis"]["port"] == 6379

    def test_load_config_with_redis_url(self):
        """REDIS_URL env var should override redis.uri in config."""
        test_redis_url = "redis://:mypassword@external-redis.example.com:6380/2"

        # patch.dict restores the previous environment on exit.
        with patch.dict(os.environ, {"REDIS_URL": test_redis_url}):
            config = self._load_docker_config()

        assert config["redis"]["uri"] == test_redis_url


class TestEntrypointScript:
    """Static checks on the text of deploy/docker/entrypoint.sh.

    These tests read the script as text; they do not execute it.
    """

    @pytest.fixture
    def entrypoint_path(self):
        """Return path to the entrypoint script."""
        return Path(__file__).parent.parent.parent / "deploy" / "docker" / "entrypoint.sh"

    def test_entrypoint_script_exists(self, entrypoint_path):
        """Entrypoint script should exist."""
        assert entrypoint_path.exists(), f"entrypoint.sh not found at {entrypoint_path}"

    def test_entrypoint_script_is_executable_content(self, entrypoint_path):
        """Entrypoint script should have proper shebang."""
        content = entrypoint_path.read_text()
        assert content.startswith("#!/bin/bash"), "entrypoint.sh should start with bash shebang"

    def test_entrypoint_checks_disable_redis_env(self, entrypoint_path):
        """Entrypoint should check CRAWL4AI_DISABLE_EMBEDDED_REDIS variable."""
        content = entrypoint_path.read_text()
        assert "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in content, \
            "entrypoint.sh should reference CRAWL4AI_DISABLE_EMBEDDED_REDIS"

    def test_entrypoint_generates_supervisord_without_redis(self, entrypoint_path):
        """When CRAWL4AI_DISABLE_EMBEDDED_REDIS=true, supervisord config should not include redis."""
        content = entrypoint_path.read_text()

        # Should have logic to create a supervisord config
        assert "supervisord.conf" in content
        # The generated config must declare the gunicorn program ...
        assert "[program:gunicorn]" in content
        # ... and must not declare a Redis program. This is the property the
        # test is named for, so it is asserted explicitly.
        assert "[program:redis]" not in content, \
            "entrypoint.sh should not define a [program:redis] section"

    def test_entrypoint_default_uses_original_supervisord(self, entrypoint_path):
        """Default behavior should use original supervisord.conf with redis."""
        content = entrypoint_path.read_text()

        # Matches both the plain and the `exec`-prefixed invocation, since the
        # latter contains the former as a substring.
        assert "supervisord -c supervisord.conf" in content


class TestDockerfileHealthcheck:
    """Static checks on the HEALTHCHECK-related text of the repository Dockerfile."""

    @pytest.fixture
    def dockerfile_path(self):
        """Locate the Dockerfile three directory levels above this test file."""
        repo_root = Path(__file__).parent.parent.parent
        return repo_root / "Dockerfile"

    def test_dockerfile_exists(self, dockerfile_path):
        """The Dockerfile must be present at the expected location."""
        found = dockerfile_path.exists()
        assert found, f"Dockerfile not found at {dockerfile_path}"

    def test_healthcheck_conditional_redis(self, dockerfile_path):
        """The Dockerfile mentions both HEALTHCHECK and the embedded-Redis switch."""
        dockerfile_text = dockerfile_path.read_text()

        for required in ("HEALTHCHECK",):
            assert required in dockerfile_text
        assert "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in dockerfile_text, \
            "Healthcheck should reference CRAWL4AI_DISABLE_EMBEDDED_REDIS for conditional redis check"

    def test_healthcheck_still_checks_app_health(self, dockerfile_path):
        """The Dockerfile still contains a curl call and a health reference."""
        dockerfile_text = dockerfile_path.read_text()

        mentions_curl = "curl" in dockerfile_text
        mentions_health = "health" in dockerfile_text
        assert mentions_curl and mentions_health, \
            "Healthcheck should curl the /health endpoint"


class TestConfigYmlDocumentation:
    """Checks that deploy/docker/config.yml mentions the external-Redis settings."""

    @pytest.fixture
    def config_path(self):
        """Locate deploy/docker/config.yml relative to this test file."""
        repo_root = Path(__file__).parent.parent.parent
        return repo_root / "deploy" / "docker" / "config.yml"

    def test_config_yml_exists(self, config_path):
        """The config file must be present at the expected location."""
        found = config_path.exists()
        assert found, f"config.yml not found at {config_path}"

    def test_config_documents_redis_url(self, config_path):
        """The config file text mentions the REDIS_URL variable."""
        yaml_text = config_path.read_text()

        mentioned = "REDIS_URL" in yaml_text
        assert mentioned, \
            "config.yml should document REDIS_URL environment variable"

    def test_config_documents_disable_embedded_redis(self, config_path):
        """The config file text mentions CRAWL4AI_DISABLE_EMBEDDED_REDIS."""
        yaml_text = config_path.read_text()

        mentioned = "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in yaml_text
        assert mentioned, \
            "config.yml should document CRAWL4AI_DISABLE_EMBEDDED_REDIS option"


if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # failure to the calling shell/CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))