Favicon of Apache Airflow

Apache Airflow

Create, schedule, and monitor complex workflows using Python. This scalable and extensible platform offers a modern UI and robust integrations for any environment.

Apache Airflow is an open-source platform for developing, scheduling, and monitoring batch-oriented workflows. It allows you to define your data pipelines as code, written entirely in Python. This code-first approach makes workflows dynamic and extensible, allowing you to generate pipelines on the fly and easily create custom operators to fit your specific needs. The architecture is built to be highly scalable, using a message queue to orchestrate an unlimited number of workers.

Key features include:

  • Pure Python: Define complex workflows using standard Python, including loops and date/time formats, without needing to learn XML or command-line interfaces.
  • Modern Web UI: Get a clear overview of your running and completed tasks. Monitor, schedule, and debug your data pipelines from a user-friendly interface.
  • Extensive Integrations: Leverage a vast library of plug-and-play operators to connect with services like Google Cloud, AWS, and Microsoft Azure.
  • Open Source Community: Benefit from a large and active community that continuously contributes improvements and offers support.

Directory Structure

apache-airflow/
├── airflow/
├── config/
├── dags/
├── logs/
├── plugins/
├── postgres-data/
├── redis-data/
├── .env
└── docker-compose.yml

docker-compose.yml

# Shared configuration inherited by every Airflow service below via the
# `<<: *airflow-common` YAML merge key (shallow merge; explicit keys win).
x-airflow-common:
  &airflow-common
  # Pinned to the 2.x line: the subcommands used by the services below
  # (`webserver`, and `airflow users create` in airflow-init) were
  # renamed/removed in Airflow 3 (`webserver` -> `api-server`; user
  # management moved to the FAB auth-manager provider). Bumping this tag
  # to 3.x requires reworking those commands as well.
  image: apache/airflow:2.10.5
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    # DB credentials come from .env via Compose `${...}` interpolation.
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:${POSTGRES_PASSWORD}@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:${POSTGRES_PASSWORD}@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW_FERNET_KEY}
    # Quoted on purpose: Airflow expects these env values as strings,
    # and unquoted true/false would parse as YAML booleans.
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    # Loads the bundled example DAGs; set to 'false' for production.
    AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
  volumes:
    - ./dags:/opt/airflow/dags
    - ./logs:/opt/airflow/logs
    - ./plugins:/opt/airflow/plugins
    - ./config:/opt/airflow/config
  # Run as the host user (group 0) so files written to the bind mounts
  # stay accessible on the host; 50000 is the official image's default UID.
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  # Metadata database (DAG runs, task instances, connections, users).
  # State is persisted to a host bind mount so it survives `docker compose down`.
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: airflow
    volumes:
      - ./postgres-data:/var/lib/postgresql/data
    healthcheck:
      # Gates dependent services (see `condition: service_healthy` above)
      # until the DB is accepting connections.
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always

  # Celery message broker; ./redis-data persists broker state across restarts.
  redis:
    image: redis:7
    volumes:
      - ./redis-data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always

  # Airflow web UI, published on host port 8080.
  # NOTE(review): the `webserver` subcommand belongs to Airflow 2.x; Airflow 3
  # renamed it to `api-server` — confirm it matches the pinned image tag.
  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      # Shallow-merge the shared postgres/redis health gates, and additionally
      # wait for the one-shot init job (DB migrated, admin user created).
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  # Schedules DAG runs and hands tasks to the Celery queue.
  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  # Celery worker that executes the scheduled tasks. Scale out with
  # `docker compose up --scale airflow-worker=N`.
  airflow-worker:
    <<: *airflow-common
    command: celery worker
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  # Runs deferred (async) tasks for deferrable operators.
  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  # Flower: Celery monitoring UI, published on host port 5555.
  airflow-flower:
    <<: *airflow-common
    command: celery flower
    ports:
      - "5555:5555"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  # One-shot bootstrap job: creates the bind-mounted directories, fixes their
  # ownership, migrates the metadata DB schema, and creates the initial admin
  # user. All other Airflow services wait for it to exit successfully.
  #
  # Escaping note: `${AIRFLOW_UID:-50000}` (single $) is interpolated by
  # Compose before the container starts, while `$${...}` is escaped to a
  # literal `${...}` so the container's bash expands it from the service
  # environment injected below.
  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    command:
      - -c
      - |
        mkdir -p /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config
        chown -R "${AIRFLOW_UID:-50000}:0" /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins /opt/airflow/config
        runuser -u airflow -- bash -c "airflow db migrate && airflow users create --role Admin --username $${_AIRFLOW_WWW_USER_USERNAME} --password $${_AIRFLOW_WWW_USER_PASSWORD} --email admin@example.com --firstname admin --lastname admin"
    environment:
      <<: *airflow-common-env
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD}
    # Runs as root so the chown above can fix ownership of the bind mounts;
    # the actual airflow commands are dropped to the `airflow` user via runuser.
    user: "0:0"
    depends_on:
      <<: *airflow-common-depends-on

.env

# Values referenced by docker-compose.yml via `${VAR}` interpolation.
# WARNING: these are placeholders — replace them with real secrets and keep
# this file out of version control (add `.env` to .gitignore).

# Database password
POSTGRES_PASSWORD=super_secret_postgres_password

# Airflow Web UI credentials
_AIRFLOW_WWW_USER_USERNAME=admin
_AIRFLOW_WWW_USER_PASSWORD=super_secret_admin_password

# Fernet key for encrypting connections (Generate a valid one using: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())")
AIRFLOW_FERNET_KEY=46BKJoQYlPPOexq0OhDZnIlNepKFf8Ep9HXQ3aS3cQk=

# User ID for Airflow processes (Default is 50000 in the official image)
AIRFLOW_UID=50000
Categories:

Share:

Ad
Favicon

 

  
 

Similar to Apache Airflow

Favicon

 

  
  
Favicon

 

  
  
Favicon