This tool describes the local Spark-on-Docker stack I use on the blog. The idea is that you can reproduce every post without depending on a remote cluster.

If you want to jump straight to the download, go to the Download section.

What this stack includes

  • Spark Master + Workers + History Server.
  • Local volumes for data, notebooks, and exports.
  • A reproducible, free setup.

Step by step (with real pieces of the compose file)

1) Spark Master

Runs the main scheduler and exposes its web UI on port 8080.
The workers register themselves here.

services:
  spark-master:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    restart: unless-stopped
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_WEBUI_PORT=8080
    healthcheck:
      test: curl -f http://localhost:8080 || exit 1
      interval: 5s
      timeout: 3s
      retries: 3
      start_period: 1s
    ports:
      - "8080:8080"

2) Spark Workers (2 services)

Each worker contributes CPU and memory to the cluster.
This stack runs two workers with 2 cores and 2 GB each.

  spark-worker-1:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=2g
    ports:
      - "8081:8081"

  spark-worker-2:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=2g
    ports:
      - "8082:8081"

3) Spark History Server

Reads the event logs and exposes the history UI on port 18080.

  spark-history:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    ports:
      - "18080:18080"
    environment:
      - SPARK_MODE=history
    volumes:
      - ./docker/spark/conf:/opt/spark/conf:ro
      - ./docker/spark/logs:/tmp/spark-events
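
Applications only show up here if they write event logs to the directory the History Server reads (/tmp/spark-events, bind-mounted from ./docker/spark/logs). This stack presumably wires that up through the mounted conf directory; the equivalent per-session settings in PySpark would look like this sketch:

from pyspark.sql import SparkSession

# Write event logs where the spark-history service reads them.
# The path must match the bind mount shared with that service.
spark = (
    SparkSession.builder
    .appName("event-log-demo")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:/tmp/spark-events")
    .getOrCreate()
)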

4) Jupyter (optional)

If you keep this service, you get notebooks on port 8888 and the Spark driver UI on port 4040.

  jupyter:
    hostname: jupyter
    build:
      context: ./docker
      dockerfile: jupyter/Dockerfile
    depends_on:
      - spark-worker-1
      - spark-worker-2
    ports:
      - "8888:8888"
      - "4040:4040"

How to bring up the stack

From content/tools/apache-spark/docker/:

docker compose up -d
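
The first run builds the images from docker/spark/Dockerfile and docker/jupyter/Dockerfile, so it takes a while; later runs reuse the cached images.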

Shut the stack down when you finish:

docker compose down

Data paths

  • Place your datasets in content/tools/apache-spark/docker/workspace/data/
  • Inside the container they are read from /home/jovyan/work/data/ (see the sketch below).
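
For example, reading a dataset from the notebook. The file name sales.csv is a hypothetical placeholder; any file you drop into workspace/data/ on the host appears under this path inside the container:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # reuses the session above

# "sales.csv" is a hypothetical placeholder file name.
df = spark.read.csv(
    "/home/jovyan/work/data/sales.csv",
    header=True,
    inferSchema=True,
)
df.show(5)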

Useful ports

  • Spark Master UI: http://localhost:8080
  • Workers: http://localhost:8081 and http://localhost:8082
  • Spark History: http://localhost:18080
  • Jupyter: http://localhost:8888 (if enabled)

docker-compose.yml (complete)
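
Note the x-spark-volumes extension field at the top: it defines the shared volume list once as a YAML anchor, and each Spark service reuses it through the *spark-volumes alias.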

x-spark-volumes: &spark-volumes
  - ./docker/spark/conf:/opt/spark/conf:rw
  - ./workspace:/home/jovyan/work

services:
  spark-master:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    restart: unless-stopped
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_WEBUI_PORT=8080
    healthcheck:
      test: curl -f http://localhost:8080 || exit 1
      interval: 5s
      timeout: 3s
      retries: 3
      start_period: 1s
    ports:
      - "8080:8080"
    volumes: *spark-volumes
    networks:
      - spark-network

  spark-worker-1:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=2g
    ports:
      - "8081:8081"
    volumes: *spark-volumes
    networks:
      - spark-network
    restart: unless-stopped

  spark-worker-2:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=2g
    ports:
      - "8082:8081"
    volumes: *spark-volumes
    networks:
      - spark-network
    restart: unless-stopped

  spark-history:
    build:
      context: ./docker
      dockerfile: spark/Dockerfile
    depends_on:
      spark-master:
        condition: service_healthy
    ports:
      - "18080:18080"
    environment:
      - SPARK_MODE=history
    volumes:
      - ./docker/spark/conf:/opt/spark/conf:ro
      - ./docker/spark/logs:/tmp/spark-events
    networks:
      - spark-network
    restart: unless-stopped

  jupyter:
    hostname: jupyter
    build:
      context: ./docker
      dockerfile: jupyter/Dockerfile
    depends_on:
      - spark-worker-1
      - spark-worker-2
    env_file:
      - ./docker/jupyter/jupyter.env
    ports:
      - "8888:8888"
      - "4040:4040"
    environment:
      - CHOWN_EXTRA=/tmp/spark-events
      - CHOWN_EXTRA_OPTS=-R
    volumes:
      - ./workspace:/home/jovyan/work
      - ./docker/jupyter/conf:/home/jovyan/.spark/conf:ro
      - ./docker/jupyter/settings:/home/jovyan/.jupyter
      - ./docker/spark/logs:/tmp/spark-events
    networks:
      - spark-network
      - proxy-network

networks:
  proxy-network:
    external: true
    name: proxy-network
  spark-network:
    name: spark-network
    driver: bridge

Download

If you don't want to copy the files by hand, download the complete stack: