./bin/generate_cluster_xml.py -f hosts -N cluster -p 1024 -S 937567216 -z 0
# Min, max, total JVM size
OPT_JVM_SIZE="-server -Xms70g -Xmx70g -Dcom.sun.management.jmxremote"
# New Generation Sizes
OPT_JVM_SIZE_NEW="-XX:NewSize=4096m -XX:MaxNewSize=4096m"
# Type of Garbage Collector to use
OPT_JVM_GC_TYPE="-XX:+UseConcMarkSweepGC -XX:+UseParNewGC"
# Tuning options for the above garbage collector
OPT_JVM_GC_OPTS="-XX:CMSInitiatingOccupancyFraction=70 -XX:SurvivorRatio=2"
# JVM GC activity logging settings
OPT_JVM_GC_LOG="-XX:+PrintTenuringDistribution -XX:+PrintGCDetails -XX:+PrintGCDateStamps -Xloggc:/var/log/gc.log"
java -Dlog4j.configuration=src/java/log4j.properties $OPT_JVM_SIZE $OPT_JVM_SIZE_NEW $OPT_JVM_GC_TYPE $OPT_JVM_GC_OPTS $OPT_JVM_GC_LOG -cp $CLASSPATH voldemort.server.VoldemortServer $@
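With -XX:+PrintGCDetails and -Xloggc as above, long stop-the-world pauses can be pulled straight out of the GC log; a small sketch, assuming the /var/log/gc.log path configured above:

# list the ten longest GC pauses recorded so far (the "real=" field of each pause line)
grep -o 'real=[0-9.]*' /var/log/gc.log | sort -t= -k2 -n | tail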
<stores>
<!-- Note that "test" store requires 2 reads and writes,
so to use this store you must have both nodes started and running -->
<store>
<name>UserTable</name>
<persistence>bdb</persistence>
<routing>client</routing>
<replication-factor>1</replication-factor>
<required-reads>1</required-reads>
<required-writes>1</required-writes>
<preferred-reads>1</preferred-reads>
<preferred-writes>1</preferred-writes>
<key-serializer>
<type>string</type>
</key-serializer>
<value-serializer>
<type>string</type>
</value-serializer>
<retention-days>1</retention-days>
</store>
</stores>
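For quick manual checks against a store defined like this, the interactive shell shipped with Voldemort can be pointed at it; a minimal sketch, assuming the stock bin/voldemort-shell.sh script and the tcp://hm-4:6666 bootstrap URL used later in this thread:

# open an interactive session against the UserTable store
./bin/voldemort-shell.sh UserTable tcp://hm-4:6666
# inside the shell, keys and values are plain strings (matching the string
# serializers above), e.g.:
#   > put "user1" "some value"
#   > get "user1"
#   > delete "user1"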
# The ID of *this* particular cluster node
node.id=0
max.threads=100
enable.repair=true
data.directory=/data/voldemort
############### DB options ######################
http.enable=true
socket.enable=true
jmx.enable=true
admin.enable=true
# BDB
bdb.write.transactions=false
bdb.flush.transactions=false
bdb.cache.size=41200MB
bdb.btree.fanout=35536
voldemort.store.PersistenceFailureException: com.sleepycat.je.EnvironmentFailureException: (JE 4.1.17) /data/voldemort/bdb fetchTarget of 0x8c/0x1e35d5c parent IN=321698 IN class=com.sleepycat.je.tree.BIN lastFullVersion=0x12e/0xfbf1e4 parent.getDirty()=true state=0 java.lang.ArrayIndexOutOfBoundsException LOG_INTEGRITY: Log information is incorrect, problem is likely persistent.
at sun.reflect.GeneratedConstructorAccessor2.newInstance(Unknown Source)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27)
at java.lang.reflect.Constructor.newInstance(Constructor.java:513)
at voldemort.utils.ReflectUtils.callConstructor(ReflectUtils.java:116)
at voldemort.utils.ReflectUtils.callConstructor(ReflectUtils.java:103)
at voldemort.store.ErrorCodeMapper.getError(ErrorCodeMapper.java:72)
at voldemort.client.protocol.vold.VoldemortNativeClientRequestFormat.checkException(VoldemortNativeClientRequestFormat.java:238)
at voldemort.client.protocol.vold.VoldemortNativeClientRequestFormat.readGetVersionResponse(VoldemortNativeClientRequestFormat.java:247)
at voldemort.store.socket.clientrequest.GetVersionsClientRequest.parseResponseInternal(GetVersionsClientRequest.java:53)
at voldemort.store.socket.clientrequest.GetVersionsClientRequest.parseResponseInternal(GetVersionsClientRequest.java:30)
at voldemort.store.socket.clientrequest.AbstractClientRequest.parseResponse(AbstractClientRequest.java:66)
at voldemort.store.socket.clientrequest.ClientRequestExecutorPool$NonblockingStoreCallbackClientRequest.parseResponse(ClientRequestExecutorPool.java:445)
at voldemort.store.socket.clientrequest.ClientRequestExecutor.read(ClientRequestExecutor.java:213)
at voldemort.common.nio.SelectorManagerWorker.run(SelectorManagerWorker.java:103)
at voldemort.common.nio.SelectorManager.run(SelectorManager.java:215)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:662)
voldemort.store.InsufficientOperationalNodesException: 1 get versionss required, but only 0 succeeded Original replication set :[3] Known failed nodes before operation :[] Estimated live nodes in preference list :[3] New failed nodes during operation :[]
at voldemort.store.routed.action.PerformSerialRequests.execute(PerformSerialRequests.java:132)
at voldemort.store.routed.Pipeline.execute(Pipeline.java:214)
at voldemort.store.routed.PipelineRoutedStore.getVersions(PipelineRoutedStore.java:530)
at voldemort.store.routed.PipelineRoutedStore.getVersions(PipelineRoutedStore.java:75)
at voldemort.store.DelegatingStore.getVersions(DelegatingStore.java:86)
at voldemort.store.DelegatingStore.getVersions(DelegatingStore.java:86)
at voldemort.store.serialized.SerializingStore.getVersions(SerializingStore.java:144)
at voldemort.store.DelegatingStore.getVersions(DelegatingStore.java:86)
at voldemort.client.DefaultStoreClient.getVersions(DefaultStoreClient.java:163)
at voldemort.client.DefaultStoreClient.put(DefaultStoreClient.java:344)
at voldemort.performance.benchmark.VoldemortWrapper$2.update(VoldemortWrapper.java:112)
at voldemort.client.DefaultStoreClient.applyUpdate(DefaultStoreClient.java:279)
at voldemort.client.DefaultStoreClient.applyUpdate(DefaultStoreClient.java:271)
at voldemort.client.LazyStoreClient.applyUpdate(LazyStoreClient.java:133)
at voldemort.performance.benchmark.VoldemortWrapper.write(VoldemortWrapper.java:107)
at voldemort.performance.benchmark.Workload.doWrite(Workload.java:399)
[14:36:36,029 voldemort.store.bdb.BdbStorageEngine] ERROR com.sleepycat.je.EnvironmentFailureException: (JE 4.1.17) /data/voldemort/bdb fetchTarget of 0x12d/0x182ae0f parent IN=17 IN class=com.sleepycat.je.tree.BIN lastFullVersion=0x12e/0x9ab9a2 parent.getDirty()=true state=0 java.lang.ArrayIndexOutOfBoundsException LOG_INTEGRITY: Log information is incorrect, problem is likely persistent. [voldemort-niosocket-server15]
[14:36:36,029 voldemort.server.protocol.vold.VoldemortNativeRequestHandler] ERROR com.sleepycat.je.EnvironmentFailureException: (JE 4.1.17) /data/voldemort/bdb fetchTarget of 0x12d/0x182ae0f parent IN=17 IN class=com.sleepycat.je.tree.BIN lastFullVersion=0x12e/0x9ab9a2 parent.getDirty()=true state=0 java.lang.ArrayIndexOutOfBoundsException LOG_INTEGRITY: Log information is incorrect, problem is likely persistent. [voldemort-niosocket-server15]
Hello. Well, I see a number of possible factors here. One is that you're running with an extraordinarily large JVM heap size and bdb.cache.size. GC stall time alone could be causing issues because the collector has so much memory to walk through. We have stores with tens of billions of keys and tens of thousands of QPS, and we get by easily with a 32 GB heap (2 GB newgen, ~30 GB oldgen) and a 20 GB bdb.cache.size. How many keys and what kind of QPS are you expecting to have?
And, out of curiosity, why are you overriding bdb.btree.fanout with such a high number? You might be inducing failures with a btree fanout that high. I have never tested with anything higher than 1024, and I would generally recommend nothing higher than 512.
What kind of device and storage configuration is /data/voldemort?
Lastly, you should set the replication-factor of your stores to at least 2. You're obviously stress testing Voldemort to see what kind of throughput it can do, so you should make the test more realistic, the way you're going to run it in a production/live environment; you will probably want redundancy when you go live with it.

I recommend running the test again, but watching your disk I/O stats, system run queue averages, memory usage, JVM heap usage and GC stalls during the process, and seeing if those give you more information. Also, watch for disk write failures or overly long write operation times. That should give you more information to work with.
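In practice that means keeping a few standard tools running on each node while the benchmark is going; a minimal sketch of what to watch (the gc.log path matches the -Xloggc setting in the launch script above, and <voldemort_pid> is a placeholder for the server's process id):

# per-device I/O rates, service times and utilisation, plus run queue and memory
iostat -x 5
vmstat 5
# heap occupancy and GC activity of the running Voldemort JVM
jstat -gcutil <voldemort_pid> 5s
# or follow the GC log configured with -Xloggc
tail -f /var/log/gc.log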
I thought that I should use all the RAM for maximum performance. That is why I set 70 GB for the Java heap and 40 GB for the BDB cache. I need to handle approximately 200,000,000 keys (~10 KB each) with a minimum throughput of 1000 ops/sec.
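As a back-of-the-envelope check of that target, the raw data volume alone is close to 2 TB before replication, so the full dataset cannot sit in RAM:

# 200,000,000 keys x ~10 KB each, expressed in GB (integer arithmetic)
echo $(( 200000000 * 10 / 1024 / 1024 ))   # ~1907 GB of raw values, before replication and storage overhead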
> And, out of curiosity, why are you overriding bdb.btree.fanout with such a high number? You might be inducing failures with a btree fanout that high. I have never tested with anything higher than 1024, and I would generally recommend nothing higher than 512.

The Voldemort manual says that a bigger value is better (http://www.project-voldemort.com/voldemort/configuration.html). I have a large amount of RAM, and that is why I set these options so high.
> What kind of device and storage configuration is /data/voldemort?

2x HDD in RAID 0.
> Lastly, you should set the replication-factor of your stores to at least 2. You're obviously stress testing Voldemort to see what kind of throughput it can do, so you should make the test more realistic, the way you're going to run it in a production/live environment; you will probably want redundancy when you go live with it. I recommend running the test again, but watching your disk I/O stats, system run queue averages, memory usage, JVM heap usage and GC stalls during the process, and seeing if those give you more information. Also, watch for disk write failures or overly long write operation times. That should give you more information to work with.

Thank you for your comments. I've configured Voldemort with your suggestions. Write tests look good now, but the write speed is lower: 2000 ops/sec instead of 4000 ops/sec, since I set replication-factor to 2. It should take 24 hours to fill in all 170,000,000 keys. IO wait on each node is approximately 1-2%:

avg-cpu: %user %nice %system %iowait %steal %idle
1,39 0,00 0,62 1,26 0,00 96,74
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
sda 0,94 25,66 34,53 45829128 61683570
sdb 24,94 3992,46 5943,97 7131320730 10617094764
sdc 24,33 3976,38 5908,17 7102587945 10553139853
md0 1396,80 7968,83 11852,14 14233882899 21170234395
<stores>
<!-- Note that "test" store requires 2 reads and writes,
so to use this store you must have both nodes started and running -->
<store>
<name>UserTable</name>
<persistence>bdb</persistence>
<routing>client</routing>
<replication-factor>2</replication-factor>
Thank you once again for your detailed answers. They are very informative.

I made the RAID 0 array for maximum speed, as I only have simple SATA HDDs. I will try your configuration tomorrow, as I'm already uploading data to the Voldemort cluster.

Actually, I have a small question about the database size. At the moment each node has approximately a 570 GB database for 64,000,000 keys (each value is ~10 KB). That means 616 GB of data (64,000,000 * 10 KB) becomes 2280 GB (570 GB * 4 nodes). I think 2280 GB is too large for 64,000,000 keys even with replication-factor=2, and it turns out that each 10 KB key becomes 37 KB. What do you think about that?
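One way to see where that space goes is to look at the BDB environment on disk directly; a sketch, using the /data/voldemort/bdb path from the server config and logs earlier in the thread:

# total on-disk size of this node's BDB environment
du -sh /data/voldemort/bdb
# the environment is a set of append-only .jdb log files; obsolete record versions
# keep occupying space until the JE cleaner reclaims them, so the on-disk footprint
# is normally larger than a plain keys-times-value-size estimate
ls -lh /data/voldemort/bdb/*.jdb | tail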
We are seeing 5488 reads per second. Does that mean a lot of keys are being fetched from the cache?
Is there any monitoring of the key cache size and the cache hit ratio?
Hello, I've set --metric-type histogram and the output is:

[status] Throughput(ops/sec): 2665.6716417910447 Operations: 2679
[reads] Operations: 2679
[reads] Average(ms): 3,1217
[reads] Min(ms): 0
[reads] Max(ms): 524
[reads] Median(ms): 0
[reads] 95th(ms): 14
[reads] 99th(ms): 75
[reads] Return: 0 2679

Does it mean all reads returned zero?
The problem is that the network throughput is not correlated with Throughput(ops/sec). For example, the benchmark reports 4500 ops/sec but the network throughput is only 6 MB/s, even though the row size is 10240 bytes (at ~10 KB per value, 4500 ops/sec should be roughly 45 MB/s).
the command is bin/voldemort-performance-tool.sh --value-size 10240 --ops-count 110000000 --url tcp://hm-4:6666 --store-name UserTable --threads 100 -v --interval 1 -r 100 --record-selection uniform --ignore-nulls --verify
PS: We are trying to find a solution for storing a billion (10 billion in the future) pictures of ~10 KB each. It must be low-latency storage (less than 1 second per user access). There is no way to buy SSDs, but we have a few (tens of) servers with 96 GB RAM each. In spite of the big RAM capacity, the dataset can't fit in memory ;)
There are 3 SATA disks per server, and 2 of them are in RAID 0. The performance of a single SATA drive is 130-150 random reads per second, so the RAID 0 array does about 250 random reads per second.
All metadata is kept in RAM, so getting one image costs 1 disk seek. So there might be about 250 image reads per second for one server. We hope Voldemort can give us that kind of performance :)
One more recommendation I have is that you reduce your total number of partitions to somewhere around 2000. Since you'll have 10s of billions of keys, don't go lower than 500 partitions. 2000 is probably good for you. One thing we have learned is that if you have too many partitions, running rebalances of the partitions to expand the cluster can take a _very_ long time. You want to have as few partitions as possible, but you still want to be able to avoid having overly hot partitions and be able to more evenly distribute load across the servers as partitions get hot.
./bin/generate_cluster_xml.py -f hosts -N cluster -p 1024 -S 937567216 -z 0
In my first message I copy-pasted the cluster config command (above). Do you mean we need to set 2000 partitions for the whole cluster? In other words, 500 per node?
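Whichever way the script interprets it, the change itself is only the -p argument; a hedged sketch reusing the other flags from the command above (check generate_cluster_xml.py --help first, since depending on the version -p may mean partitions per node rather than a cluster-wide total):

# ~2000 partitions for the whole cluster, if -p is a cluster-wide total
./bin/generate_cluster_xml.py -f hosts -N cluster -p 2000 -S 937567216 -z 0
# ...or 500 per node across 4 nodes, if -p is per node
./bin/generate_cluster_xml.py -f hosts -N cluster -p 500 -S 937567216 -z 0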
./voldemort-performance-tool.sh --value-size 10240 --ops-count 114359252 --url tcp://hm-4:6666 --store-name UserTable -r 100 --threads 100 -v --interval 1 --ignore-nulls --request-file shuf --num-connections-per-node=100
Dear Brendan, which option should I tune to increase the cache size? Does it depend on the Java heap?
How should I calculate the cache size for 100'000'000 keys? Will I get problems with GC if I set -Xms70g -Xmx70g?
Hello, Brendan. Our first goal is to fit the metadata into the cache (to avoid 2 disk seeks per read request), so we don't need that much heap; 16 GB per node is, I think, enough.
The question is: when does bdb.cache store the metadata? Does it put metadata into the cache at write time or only when it is read?
If metadata is only cached on reads, we need some way to warm up the metadata cache. Is there a way to do that?
I'd like to thank you once more for your answers!
I was using the mongoperf utility to test our SATA drive.
It does 120-150 random read requests per second against a 500 GB file with a 10 KB block size,
so I'm expecting to get 200-250 random reads per second per Voldemort server.
Am I right, or is there some mistake?
Thank you
He's trying to say that a single SATA drive gives 120-150 random read requests per second, so when we use RAID 0 we should get roughly a 1.5x speed increase. That means one node should give ~200-250 requests per second and the whole cluster (4 nodes) should give ~800-1000 requests per second.
<stores>
<!-- Note that "test" store requires 2 reads and writes,
so to use this store you must have both nodes started and running -->
<store>
<name>UserTable</name>
<persistence>bdb</persistence>
<routing>client</routing>
<replication-factor>1</replication-factor>
<required-reads>1</required-reads>
<required-writes>1</required-writes>
<preferred-reads>1</preferred-reads>
<preferred-writes>1</preferred-writes>
<key-serializer>
<type>string</type>
</key-serializer>
<value-serializer>
<type>string</type>
</value-serializer>
<retention-days>1</retention-days>
</store>
</stores>
<!-- Partition distribution generated using seed [32819401880] -->
<cluster>
<name>voldemort</name>
<server>
<id>0</id>
<host>hm-4</host>
<http-port>6665</http-port>
<socket-port>6666</socket-port>
<admin-port>6667</admin-port>
<partitions>0, 1, 3, 5, 7, 9, 12, 13, 15, 16, 17, 18, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 43, 47, 48, 49, 50, 52, 55, 57, 58, 66, 71, 72, 77, 78, 80, 81, 85, 86, 87, 89, 98, 101, 102, 104, 105, 109, 113, 114, 115, 116, 117, 118, 120, 121, 123, 124, 125, 130, 131, 132, 134, 137, 139, 140, 142, 143, 144, 147, 153, 157, 160, 161, 165, 167, 168, 170, 171, 174, 175, 178, 184, 185, 186, 187, 188, 189, 195, 196, 198, 201, 204, 207, 208, 209, 212, 214, 216, 217, 219, 222, 226, 229, 230, 231, 232, 235, 236, 239, 240, 242, 245, 246, 247, 248, 256, 257, 258, 259, 262, 264, 265, 266, 268, 271, 273, 276, 278, 279, 280, 285, 286, 287, 289, 290, 291, 292, 294, 295, 296, 297, 306, 309, 312, 314, 315, 317, 321, 322, 323, 324, 326, 327, 328, 329, 330, 332, 334, 336, 337, 338, 339, 342, 343, 345, 346, 347, 349, 350, 352, 353, 354, 356, 362, 364, 365, 366, 367, 370, 372, 373, 374, 375, 381, 383, 386, 387, 388, 389, 395, 398, 399, 403, 407, 410, 412, 414, 416, 418, 419, 423, 424, 426, 428, 429, 432, 433, 434, 435, 436, 437, 438, 440, 445, 446, 448, 449, 452, 453, 454, 456, 457, 459, 461, 462, 463, 465, 466, 467, 469, 472, 473, 476, 477, 478, 479, 480, 485, 487, 489, 492, 493, 494, 495, 496, 497, 498, 503, 505, 507, 509, 510, 514, 516, 517, 518, 519, 521, 522, 523, 527, 528, 532, 533, 534, 535, 539, 542, 545, 546, 547, 549, 550, 554, 555, 556, 557, 558, 559, 560, 561, 564, 565, 566, 568, 570, 571, 572, 574, 575, 577, 578, 580, 581, 588, 591, 593, 594, 596</partitions>
</server>
<server>
<id>1</id>
<host>hm-4</host>
<http-port>6668</http-port>
<socket-port>6669</socket-port>
<admin-port>6670</admin-port>
<partitions>2, 4, 6, 8, 10, 11, 14, 19, 20, 21, 22, 23, 24, 25, 31, 37, 38, 39, 40, 41, 42, 44, 45, 46, 51, 53, 54, 56, 59, 60, 61, 62, 63, 64, 65, 67, 68, 69, 70, 73, 74, 75, 76, 79, 82, 83, 84, 88, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100, 103, 106, 107, 108, 110, 111, 112, 119, 122, 126, 127, 128, 129, 133, 135, 136, 138, 141, 145, 146, 148, 149, 150, 151, 152, 154, 155, 156, 158, 159, 162, 163, 164, 166, 169, 172, 173, 176, 177, 179, 180, 181, 182, 183, 190, 191, 192, 193, 194, 197, 199, 200, 202, 203, 205, 206, 210, 211, 213, 215, 218, 220, 221, 223, 224, 225, 227, 228, 233, 234, 237, 238, 241, 243, 244, 249, 250, 251, 252, 253, 254, 255, 260, 261, 263, 267, 269, 270, 272, 274, 275, 277, 281, 282, 283, 284, 288, 293, 298, 299, 300, 301, 302, 303, 304, 305, 307, 308, 310, 311, 313, 316, 318, 319, 320, 325, 331, 333, 335, 340, 341, 344, 348, 351, 355, 357, 358, 359, 360, 361, 363, 368, 369, 371, 376, 377, 378, 379, 380, 382, 384, 385, 390, 391, 392, 393, 394, 396, 397, 400, 401, 402, 404, 405, 406, 408, 409, 411, 413, 415, 417, 420, 421, 422, 425, 427, 430, 431, 439, 441, 442, 443, 444, 447, 450, 451, 455, 458, 460, 464, 468, 470, 471, 474, 475, 481, 482, 483, 484, 486, 488, 490, 491, 499, 500, 501, 502, 504, 506, 508, 511, 512, 513, 515, 520, 524, 525, 526, 529, 530, 531, 536, 537, 538, 540, 541, 543, 544, 548, 551, 552, 553, 562, 563, 567, 569, 573, 576, 579, 582, 583, 584, 585, 586, 587, 589, 590, 592, 595, 597, 598, 599</partitions>
</server>
</cluster>
# The ID of *this* particular cluster node
node.id=0
max.threads=100
enable.repair=true
data.directory=/data1/voldemort
############### DB options ######################
http.enable=true
socket.enable=true
jmx.enable=true
admin.enable=true
# BDB
bdb.write.transactions=false
bdb.flush.transactions=false
bdb.cache.size=10g
bdb.minimize.scan.impact=false
# The ID of *this* particular cluster node
node.id=1
max.threads=100
enable.repair=true
data.directory=/data2/voldemort
############### DB options ######################
http.enable=true
socket.enable=true
jmx.enable=true
admin.enable=true
# BDB
bdb.write.transactions=false
bdb.flush.transactions=false
bdb.cache.size=10g
bdb.minimize.scan.impact=false
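# flush dirty pages, then drop the OS page cache so the read test starts cold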
sync
echo 3 > /proc/sys/vm/drop_caches
avg-cpu: %user %nice %system %iowait %steal %idle
0,16 0,00 0,31 15,39 0,00 84,14
Device: tps Blk_read/s Blk_wrtn/s Blk_read Blk_wrtn
sda 0,00 0,00 0,00 0 0
sdb 240,00 13120,00 0,00 13120 0
sdc 293,00 12272,00 0,00 12272 0