Hi,
We're using Druid in our real time analytics pipeline and would appreciate some help with tuning the configurations for better query performance and stability in general.
We're using 3 servers, each with 256GB RAM and 16 physical cores, for Druid. We also have 5 machines running HDFS (4 of which are colocated with Druid), each with 1TB of disk space.
We've containerized druid. On each of the 3 druid machines, we have
1) one container that's running the historical and middle manager processes
2) one container that's serving as the broker node
3) one of the machines is running a container with the coordinator and overlord processes
We are ingesting data from Kafka using Kafka Indexing service. We have 3 topics with 64 partitions each. We assign 12 indexing tasks per topic and have a task duration of 10 mins and segment granularity of 10 mins (are we creating too many segments which is affecting the query performance?).
Find below our cluster configurations. Could someone please review and offer some advice on tuning our cluster?
Thanks in advance,
Avinash
Broker:
jvm configs
-server
-Xms24g
-Xmx24g
-XX:NewSize=6g
-XX:MaxNewSize=6g
-XX:MaxDirectMemorySize=64g
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCDateStamps
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/monitor/druid/logs
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=/monitor/druid/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
runtime.properties
druid.service=druid/broker
druid.port=9082
druid.host=<host>
# HTTP server threads
druid.broker.http.numConnections=20
druid.server.http.numThreads=50
# Processing threads and buffers
druid.processing.numThreads=7
# Query cache disabled -- push down caching and merging instead
druid.broker.cache.useCache=false
druid.broker.cache.populateCache=false
Coordinator
jvm configs
-server
-Xms10g
-Xmx10g
-XX:NewSize=512m
-XX:MaxNewSize=512m
-XX:MaxDirectMemorySize=10g
-XX:+UseG1GC
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCDateStamps
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/monitor/druid/logs
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=/monitor/druid/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Dderby.stream.error.file=/monitor/druid/tmp/derby.log
runtime.properties
druid.service=druid/coordinator
druid.port=8181
druid.host=<host>
druid.coordinator.startDelay=PT30S
druid.coordinator.period=PT60S
Historical
jvm configs
-server
-Xms12g
-Xmx12g
-XX:NewSize=6g
-XX:MaxNewSize=6g
-XX:MaxDirectMemorySize=32g
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCDateStamps
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/monitor/druid/logs
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=/monitor/druid/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
runtime.properties:
druid.service=druid/historical
druid.port=9083
druid.host=<host>
# HTTP server threads
druid.server.http.numThreads=50
# Processing threads and buffers
druid.processing.numThreads=7
# Segment storage
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize"\:130000000000}]
druid.server.maxSize=130000000000
# Query cache
druid.historical.cache.useCache=true
druid.historical.cache.populateCache=true
druid.cache.type=local
druid.cache.sizeInBytes=2000000000
Middle Manager
jvm configs
-server
-Xms64m
-Xmx64m
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCDateStamps
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/monitor/druid/logs
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=/monitor/druid/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
runtime.properties
druid.service=druid/middlemanager
druid.port=9091
druid.host=<host>
# Number of tasks per middleManager
druid.worker.capacity=80
# Task launch parameters
druid.indexer.runner.javaOpts=-server -Xmx3g -XX:MaxDirectMemorySize=4096m -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
druid.indexer.task.restoreTasksOnRestart=true
# HTTP server threads
druid.server.http.numThreads=40
# Processing threads and buffers
druid.processing.buffer.sizeBytes=536870912
druid.processing.numThreads=2
# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp
druid.indexer.task.defaultHadoopCoordinates=["org.apache.hadoop:hadoop-client:2.7.3"]
Overlord
jvm configs
-server
-Xms4g
-Xmx4g
-XX:+UseConcMarkSweepGC
-XX:+PrintGCDetails
-XX:+PrintGCTimeStamps
-XX:+PrintGCDateStamps
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/monitor/druid/logs
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=/monitor/druid/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
runtime.properties
druid.host=<host>
druid.service=druid/overlord
druid.port=9090
druid.indexer.queue.startDelay=PT30S
druid.indexer.runner.type=remote
druid.indexer.storage.type=metadata