Add optional external Redis support to the Crawl4AI Docker image.

* utils.load_config(): a non-empty REDIS_URL environment variable is stored
  in config["redis"]["uri"].
* entrypoint.sh (new): when CRAWL4AI_DISABLE_EMBEDDED_REDIS is exactly "true",
  supervisord is started from a generated /tmp/supervisord.conf that only
  contains the gunicorn program; otherwise the stock supervisord.conf is used.
* Dockerfile: ENTRYPOINT replaces CMD, and the HEALTHCHECK skips
  `redis-cli ping` when the embedded Redis is disabled.

Review notes on tests/docker/test_external_redis.py:
* the load_config tests now evict "utils" from sys.modules (the name that is
  actually imported) instead of the never-populated "deploy.docker.utils";
* the entrypoint test now asserts that "[program:redis]" is absent, as its
  comment already claimed;
* a redundant `or` in the default-branch test is collapsed to one check.

NOTE(review): indentation and whitespace-only lines were reconstructed from a
flattened copy of this patch, and the blob id on the test file's index line
was not recomputed after the review edits - regenerate with `git diff`
before relying on them.

diff --git a/Dockerfile b/Dockerfile
index 7cfdcf0e..c14e2e98 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -178,6 +178,9 @@ COPY deploy/docker/* ${APP_HOME}/
 # copy the playground + any future static assets
 COPY deploy/docker/static ${APP_HOME}/static
 
+# Make entrypoint executable
+RUN chmod +x ${APP_HOME}/entrypoint.sh
+
 # Change ownership of the application directory to the non-root user
 RUN chown -R appuser:appuser ${APP_HOME}
 
@@ -191,15 +194,17 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
         echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
         exit 1; \
     fi && \
-    redis-cli ping > /dev/null && \
+    if [ "${CRAWL4AI_DISABLE_EMBEDDED_REDIS}" != "true" ]; then \
+        redis-cli ping > /dev/null || exit 1; \
+    fi && \
     curl -f http://localhost:11235/health || exit 1'
 
 EXPOSE 6379
 # Switch to the non-root user before starting the application
 USER appuser
 
-# Set environment variables to ptoduction
-ENV PYTHON_ENV=production
+# Set environment variables to production
+ENV PYTHON_ENV=production
 
-# Start the application using supervisord
-CMD ["supervisord", "-c", "supervisord.conf"]
\ No newline at end of file
+# Start the application using entrypoint (handles conditional Redis)
+ENTRYPOINT ["/app/entrypoint.sh"]
\ No newline at end of file
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index c3c968f4..75746684 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -881,8 +881,13 @@ llm:
   provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
   # api_key: sk-... # If you pass the API key directly (not recommended)
 
-# Redis Configuration (Used by internal Redis server managed by supervisord)
+# Redis Configuration
+# By default, an embedded Redis server runs inside the container.
+# To use an external Redis instead, set these environment variables:
+#   REDIS_URL=redis://:password@hostname:6379/0
+#   CRAWL4AI_DISABLE_EMBEDDED_REDIS=true
 redis:
+  # uri: "redis://localhost:6379/0" # Override with full URI (or use REDIS_URL env var)
   host: "localhost"
   port: 6379
   db: 0
@@ -990,12 +995,26 @@ You can override the default `config.yml`.
    - Set timeouts according to your content size and network conditions
    - Use Redis for rate limiting in multi-container setups
 
-3. **Monitoring** 📊
+3. **External Redis** 🔗
+   - By default, Crawl4AI runs an embedded Redis server inside the container
+   - For production deployments, you may want to use an external Redis for:
+     - Better memory management (embedded Redis can grow unboundedly)
+     - Shared state across multiple Crawl4AI containers
+     - Easier Redis version management and security patching
+   - To use external Redis:
+     ```bash
+     docker run -d -p 11235:11235 \
+       -e REDIS_URL=redis://:password@your-redis-host:6379/0 \
+       -e CRAWL4AI_DISABLE_EMBEDDED_REDIS=true \
+       unclecode/crawl4ai:latest
+     ```
+
+4. **Monitoring** 📊
    - Enable Prometheus if you need metrics
    - Set DEBUG logging in development, INFO in production
    - Regular health check monitoring is crucial
 
-4. **Performance Tuning** ⚡
+5. **Performance Tuning** ⚡
    - Start with conservative rate limiter delays
    - Increase batch_process timeout for large content
    - Adjust stream_init timeout based on initial response times
diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml
index db3193a6..692876d6 100644
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -14,7 +14,12 @@ llm:
   # api_key: sk-... # If you pass the API key directly (not recommended)
 
 # Redis Configuration
+# To use external Redis instead of embedded, set REDIS_URL environment variable:
+#   REDIS_URL=redis://:password@hostname:6379/0
+# When using external Redis, also set CRAWL4AI_DISABLE_EMBEDDED_REDIS=true
+# to prevent the embedded Redis from starting.
 redis:
+  # uri: "redis://localhost:6379/0" # Override with full URI (or use REDIS_URL env var)
   host: "localhost"
   port: 6379
   db: 0
@@ -24,10 +29,6 @@ redis:
   ssl_ca_certs: None
   ssl_certfile: None
   ssl_keyfile: None
-  ssl_cert_reqs: None
-  ssl_ca_certs: None
-  ssl_certfile: None
-  ssl_keyfile: None
 
 # Rate Limiting Configuration
 rate_limiting:
diff --git a/deploy/docker/entrypoint.sh b/deploy/docker/entrypoint.sh
new file mode 100644
index 00000000..597df062
--- /dev/null
+++ b/deploy/docker/entrypoint.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Crawl4AI Docker Entrypoint
+# Handles conditional embedded Redis startup based on CRAWL4AI_DISABLE_EMBEDDED_REDIS
+
+set -e
+
+# If CRAWL4AI_DISABLE_EMBEDDED_REDIS is set to true, modify supervisord.conf
+# to remove the Redis program section
+if [ "${CRAWL4AI_DISABLE_EMBEDDED_REDIS}" = "true" ]; then
+    echo "External Redis mode: Disabling embedded Redis server"
+
+    # Create a modified supervisord.conf without Redis
+    cat > /tmp/supervisord.conf << 'EOF'
+[supervisord]
+nodaemon=true
+logfile=/dev/null
+logfile_maxbytes=0
+
+[program:gunicorn]
+command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
+directory=/app
+user=appuser
+autorestart=true
+priority=20
+environment=PYTHONUNBUFFERED=1
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+EOF
+
+    exec supervisord -c /tmp/supervisord.conf
+else
+    # Default: use the original supervisord.conf with embedded Redis
+    exec supervisord -c supervisord.conf
+fi
diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py
index 52f4e11f..70c025b4 100644
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -36,7 +36,13 @@ def load_config() -> Dict:
     if llm_api_key and "api_key" not in config["llm"]:
         config["llm"]["api_key"] = llm_api_key
         logging.info("LLM API key loaded from LLM_API_KEY environment variable")
-    
+
+    # Override Redis URI from environment if set (for external Redis support)
+    redis_url = os.environ.get("REDIS_URL")
+    if redis_url:
+        config["redis"]["uri"] = redis_url
+        logging.info("Redis URI overridden from REDIS_URL environment variable")
+
     return config
 
 def setup_logging(config: Dict) -> None:
diff --git a/tests/docker/test_external_redis.py b/tests/docker/test_external_redis.py
new file mode 100644
index 00000000..28e4fb0e
--- /dev/null
+++ b/tests/docker/test_external_redis.py
@@ -0,0 +1,171 @@
+"""
+Tests for external Redis configuration support.
+
+Tests the ability to configure Crawl4AI to use an external Redis server
+instead of the embedded Redis via the REDIS_URL environment variable
+and CRAWL4AI_DISABLE_EMBEDDED_REDIS flag.
+"""
+
+import os
+import pytest
+import tempfile
+import subprocess
+from pathlib import Path
+from unittest.mock import patch
+
+
+class TestRedisUrlEnvironmentVariable:
+    """Tests for REDIS_URL environment variable handling in load_config()."""
+
+    def test_load_config_without_redis_url(self):
+        """Default config should not have redis.uri set."""
+        # Ensure REDIS_URL is not set
+        env = os.environ.copy()
+        env.pop("REDIS_URL", None)
+
+        with patch.dict(os.environ, env, clear=True):
+            # Import fresh to pick up env changes
+            import sys
+
+            # Remove cached module if exists (it is imported below as bare "utils")
+            sys.modules.pop("utils", None)
+
+            # Add deploy/docker to path temporarily
+            deploy_docker_path = Path(__file__).parent.parent.parent / "deploy" / "docker"
+            sys.path.insert(0, str(deploy_docker_path))
+
+            try:
+                from utils import load_config
+                config = load_config()
+
+                # uri should not be set by default
+                assert "uri" not in config["redis"] or config["redis"].get("uri") is None
+                # default host should be localhost
+                assert config["redis"]["host"] == "localhost"
+                assert config["redis"]["port"] == 6379
+            finally:
+                sys.path.remove(str(deploy_docker_path))
+
+    def test_load_config_with_redis_url(self):
+        """REDIS_URL env var should override redis.uri in config."""
+        test_redis_url = "redis://:mypassword@external-redis.example.com:6380/2"
+
+        env = os.environ.copy()
+        env["REDIS_URL"] = test_redis_url
+
+        with patch.dict(os.environ, env, clear=False):
+            import sys
+
+            sys.modules.pop("utils", None)
+
+            deploy_docker_path = Path(__file__).parent.parent.parent / "deploy" / "docker"
+            sys.path.insert(0, str(deploy_docker_path))
+
+            try:
+                from utils import load_config
+                config = load_config()
+
+                assert config["redis"]["uri"] == test_redis_url
+            finally:
+                sys.path.remove(str(deploy_docker_path))
+
+
+class TestEntrypointScript:
+    """Tests for the entrypoint.sh script behavior."""
+
+    @pytest.fixture
+    def entrypoint_path(self):
+        """Return path to the entrypoint script."""
+        return Path(__file__).parent.parent.parent / "deploy" / "docker" / "entrypoint.sh"
+
+    def test_entrypoint_script_exists(self, entrypoint_path):
+        """Entrypoint script should exist."""
+        assert entrypoint_path.exists(), f"entrypoint.sh not found at {entrypoint_path}"
+
+    def test_entrypoint_script_is_executable_content(self, entrypoint_path):
+        """Entrypoint script should have proper shebang."""
+        content = entrypoint_path.read_text()
+        assert content.startswith("#!/bin/bash"), "entrypoint.sh should start with bash shebang"
+
+    def test_entrypoint_checks_disable_redis_env(self, entrypoint_path):
+        """Entrypoint should check CRAWL4AI_DISABLE_EMBEDDED_REDIS variable."""
+        content = entrypoint_path.read_text()
+        assert "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in content, \
+            "entrypoint.sh should reference CRAWL4AI_DISABLE_EMBEDDED_REDIS"
+
+    def test_entrypoint_generates_supervisord_without_redis(self, entrypoint_path):
+        """When CRAWL4AI_DISABLE_EMBEDDED_REDIS=true, supervisord config should not include redis."""
+        content = entrypoint_path.read_text()
+
+        # Should have logic to create config without redis
+        assert "supervisord.conf" in content
+        # Should NOT have [program:redis] in the generated config when disabled
+        assert "[program:redis]" not in content
+        # The script generates a config that only has gunicorn
+        assert "[program:gunicorn]" in content
+
+    def test_entrypoint_default_uses_original_supervisord(self, entrypoint_path):
+        """Default behavior should use original supervisord.conf with redis."""
+        content = entrypoint_path.read_text()
+
+        # Should have else branch that uses original supervisord.conf
+        assert "exec supervisord -c supervisord.conf" in content
+
+
+class TestDockerfileHealthcheck:
+    """Tests for Dockerfile healthcheck configuration."""
+
+    @pytest.fixture
+    def dockerfile_path(self):
+        """Return path to the Dockerfile."""
+        return Path(__file__).parent.parent.parent / "Dockerfile"
+
+    def test_dockerfile_exists(self, dockerfile_path):
+        """Dockerfile should exist."""
+        assert dockerfile_path.exists(), f"Dockerfile not found at {dockerfile_path}"
+
+    def test_healthcheck_conditional_redis(self, dockerfile_path):
+        """Healthcheck should conditionally check redis based on CRAWL4AI_DISABLE_EMBEDDED_REDIS."""
+        content = dockerfile_path.read_text()
+
+        assert "HEALTHCHECK" in content
+        assert "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in content, \
+            "Healthcheck should reference CRAWL4AI_DISABLE_EMBEDDED_REDIS for conditional redis check"
+
+    def test_healthcheck_still_checks_app_health(self, dockerfile_path):
+        """Healthcheck should always check the application health endpoint."""
+        content = dockerfile_path.read_text()
+
+        assert "curl" in content and "health" in content, \
+            "Healthcheck should curl the /health endpoint"
+
+
+class TestConfigYmlDocumentation:
+    """Tests for config.yml documentation of external Redis."""
+
+    @pytest.fixture
+    def config_path(self):
+        """Return path to config.yml."""
+        return Path(__file__).parent.parent.parent / "deploy" / "docker" / "config.yml"
+
+    def test_config_yml_exists(self, config_path):
+        """config.yml should exist."""
+        assert config_path.exists(), f"config.yml not found at {config_path}"
+
+    def test_config_documents_redis_url(self, config_path):
+        """config.yml should document REDIS_URL environment variable."""
+        content = config_path.read_text()
+
+        assert "REDIS_URL" in content, \
+            "config.yml should document REDIS_URL environment variable"
+
+    def test_config_documents_disable_embedded_redis(self, config_path):
+        """config.yml should document CRAWL4AI_DISABLE_EMBEDDED_REDIS option."""
+        content = config_path.read_text()
+
+        assert "CRAWL4AI_DISABLE_EMBEDDED_REDIS" in content, \
+            "config.yml should document CRAWL4AI_DISABLE_EMBEDDED_REDIS option"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])