import com.arangodb.ArangoDB;
import com.arangodb.ArangoGraph;
import com.arangodb.entity.BaseDocument;
import com.arangodb.entity.BaseEdgeDocument;
import com.arangodb.entity.EdgeDefinition;
import com.arangodb.entity.StreamTransactionEntity;
import com.arangodb.model.EdgeCreateOptions;
import com.arangodb.model.StreamTransactionOptions;
import com.arangodb.model.VertexCreateOptions;

import java.util.Collections;
import java.util.UUID;

public class Test {

    static final String DATABASE = "mydb";
    static final String GRAPH = "mygraph";

    public static void main(String[] args) {
        ArangoDB arangoDB = new ArangoDB.Builder()
                .host("localhost", 8529)
                .build();

        // start from a clean database
        if (arangoDB.db(DATABASE).exists()) {
            arangoDB.db(DATABASE).drop();
        }
        arangoDB.db(DATABASE).create();
        arangoDB.db(DATABASE).createCollection("MY_test_vertex_from1");
        arangoDB.db(DATABASE).createCollection("MY_test_vertex_from2");
        arangoDB.db(DATABASE).createCollection("MY_test_vertex_to");
        arangoDB.db(DATABASE).createGraph(GRAPH, Collections.singletonList(new EdgeDefinition()
                .collection("MY_test_edge")
                .from("MY_test_vertex_from1", "MY_test_vertex_from2")
                .to("MY_test_vertex_to")
        ));

        int iterations = 1_000;

        // warmup
        for (int i = 0; i < iterations; i++) {
            String from1 = "from1-" + UUID.randomUUID();
            String from2 = "from2-" + UUID.randomUUID();
            String to = "to-" + UUID.randomUUID();
            addNodes(arangoDB, from1, from2, to);
            addEdge(arangoDB, from1, from2, to);
        }

        // timed run
        long start = System.currentTimeMillis();
        for (int i = 0; i < iterations; i++) {
            String from1 = "from1-" + UUID.randomUUID();
            String from2 = "from2-" + UUID.randomUUID();
            String to = "to-" + UUID.randomUUID();
            addNodes(arangoDB, from1, from2, to);
            addEdge(arangoDB, from1, from2, to);
        }
        long elapsed = System.currentTimeMillis() - start;
        System.out.println("elapsed: " + elapsed + " ms");
        System.out.println("avg: " + (1.0 * elapsed / iterations) + " ms");

        arangoDB.shutdown();
    }

    // inserts the two edges of one iteration inside a single stream transaction
    private static void addEdge(ArangoDB arangoDB, String from1, String from2, String to) {
        ArangoGraph graph = arangoDB.db(DATABASE).graph(GRAPH);
        StreamTransactionEntity tx = graph.db().beginStreamTransaction(
                new StreamTransactionOptions()
                        .waitForSync(false)
                        .writeCollections("MY_test_edge"));
        EdgeCreateOptions options = new EdgeCreateOptions()
                .streamTransactionId(tx.getId())
                .waitForSync(false);
        try {
            graph.edgeCollection("MY_test_edge").insertEdge(
                    new BaseEdgeDocument("MY_test_vertex_from1/" + from1, "MY_test_vertex_to/" + to), options);
            graph.edgeCollection("MY_test_edge").insertEdge(
                    new BaseEdgeDocument("MY_test_vertex_from2/" + from2, "MY_test_vertex_to/" + to), options);
            graph.db().commitStreamTransaction(tx.getId());
        } catch (Exception e) {
            graph.db().abortStreamTransaction(tx.getId());
            throw e;
        }
    }

    // inserts the three vertices of one iteration inside a single stream transaction
    private static void addNodes(ArangoDB arangoDB, String from1, String from2, String to) {
        ArangoGraph graph = arangoDB.db(DATABASE).graph(GRAPH);
        StreamTransactionEntity tx = graph.db().beginStreamTransaction(
                new StreamTransactionOptions()
                        .waitForSync(false)
                        .writeCollections("MY_test_vertex_from1", "MY_test_vertex_from2", "MY_test_vertex_to"));
        VertexCreateOptions options = new VertexCreateOptions()
                .streamTransactionId(tx.getId())
                .waitForSync(false);
        try {
            graph.vertexCollection("MY_test_vertex_from1").insertVertex(new BaseDocument(from1), options);
            graph.vertexCollection("MY_test_vertex_from2").insertVertex(new BaseDocument(from2), options);
            graph.vertexCollection("MY_test_vertex_to").insertVertex(new BaseDocument(to), options);
            graph.db().commitStreamTransaction(tx.getId());
        } catch (Exception e) {
            e.printStackTrace();
            graph.db().abortStreamTransaction(tx.getId());
            throw e;
        }
    }
}
> We have written a test harness to evaluate performance of a number of graph alternatives.
> The original snippet of code is not part of the harness, but was an example of how we are
> adding data through the Java driver. Because we are currently using Neo4j, that was the
> initial implementation for that harness. The test consists of adding 2M nodes using
> batches/transactions of 500. Tests are being run initially on dev laptops with the end goal
> of running all of the tests on a single, more powerful environment. With Neo4j, we are able
> to add the 2M nodes in roughly 6-7 minutes. With ArangoDB, we are in the neighborhood
> of 35-40 minutes for the same data, so as you can see, this is a pretty dramatic difference.
Hi Rob,
there are several approaches that can improve the performance considerably.
(1) Single Document Operation
The program you provided uses a single document operation for each vertex and edge it inserts. It is based on the synchronous driver, so the requests do not run in parallel.
To make better use of the server, you can run the vertex and edge creation in parallel using Java threads. In Michele's example program, we also raised the transaction size to 500.
You can find Michele’s version here: https://gist.github.com/rashtao/831c7e0281314789a2e2b57e8b3bfe67
This is just a proof of concept and not production code quality.
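To illustrate the shape of this approach (a minimal sketch, not Michele's actual gist: the class name, thread count, workload, key prefix, and the single-collection setup are all illustrative assumptions), parallel single-document inserts with stream transactions of 500 could look like this:

import com.arangodb.ArangoCollection;
import com.arangodb.ArangoDB;
import com.arangodb.ArangoDatabase;
import com.arangodb.entity.BaseDocument;
import com.arangodb.entity.StreamTransactionEntity;
import com.arangodb.model.DocumentCreateOptions;
import com.arangodb.model.StreamTransactionOptions;

import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelInsert {

    static final int THREADS = 8;           // illustrative, tune for your hardware
    static final int TX_SIZE = 500;         // transaction size discussed above
    static final int TXS_PER_THREAD = 100;  // illustrative workload

    public static void main(String[] args) throws InterruptedException {
        // one ArangoDB instance is shared by all threads; size the connection pool to match
        ArangoDB arangoDB = new ArangoDB.Builder()
                .host("localhost", 8529)
                .maxConnections(THREADS)
                .build();
        ArangoDatabase db = arangoDB.db("mydb");

        ExecutorService pool = Executors.newFixedThreadPool(THREADS);
        for (int t = 0; t < THREADS; t++) {
            pool.submit(() -> {
                ArangoCollection col = db.collection("MY_test_vertex_from1");
                for (int b = 0; b < TXS_PER_THREAD; b++) {
                    // one stream transaction per batch of TX_SIZE single-document inserts
                    StreamTransactionEntity tx = db.beginStreamTransaction(
                            new StreamTransactionOptions().writeCollections("MY_test_vertex_from1"));
                    DocumentCreateOptions options = new DocumentCreateOptions()
                            .streamTransactionId(tx.getId());
                    try {
                        for (int i = 0; i < TX_SIZE; i++) {
                            col.insertDocument(new BaseDocument("v-" + UUID.randomUUID()), options);
                        }
                        db.commitStreamTransaction(tx.getId());
                    } catch (RuntimeException e) {
                        db.abortStreamTransaction(tx.getId());
                        throw e;
                    }
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
        arangoDB.shutdown();
    }
}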
With Michele's version we get the following numbers on a laptop:
1200000 vertexes
800000 edges
elapsed: 195275 ms
10241 insertions/s
This is roughly 10x faster than your numbers. Obviously, this is not the same test environment, but the laptop we used is not the fastest.
The drawback of this approach is the large amount of communication between client and server. To make full use of batching, the following approach helps.
(2) Insert using AQL
You can use AQL to insert batches of vertices and edges. Michele’s example program can be found here: https://gist.github.com/rashtao/5b72b6187d1a6b50aa129a9f3c5fb2ef
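As a rough sketch of the idea (not the gist itself; the class name, collection name, batch size, and document shape are assumptions), a whole batch can be handed to a single AQL query through bind variables, so one request inserts 500 documents server-side:

import com.arangodb.ArangoDB;
import com.arangodb.ArangoDatabase;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

public class AqlBatchInsert {

    public static void main(String[] args) {
        ArangoDB arangoDB = new ArangoDB.Builder().host("localhost", 8529).build();
        ArangoDatabase db = arangoDB.db("mydb");

        // build one batch of 500 plain documents
        List<Map<String, Object>> batch = new ArrayList<>();
        for (int i = 0; i < 500; i++) {
            Map<String, Object> doc = new HashMap<>();
            doc.put("_key", "v-" + UUID.randomUUID());
            batch.add(doc);
        }

        // a single AQL query inserts the whole batch server-side
        Map<String, Object> bindVars = new HashMap<>();
        bindVars.put("docs", batch);
        bindVars.put("@col", "MY_test_vertex_from1"); // bound as @@col in the query
        db.query("FOR d IN @docs INSERT d INTO @@col", bindVars, null, Void.class);

        arangoDB.shutdown();
    }
}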
With Michele's version we reach the following numbers (on the same laptop as above):
1200000 vertexes
800000 edges
elapsed: 72617 ms
27541 insertions/s
That is roughly 2.5x faster than the previous approach. With this setup, you can import 2 million documents in about 1 min 12 s.
Please note that if you use much larger transaction sizes, you should enable intermediate commits; see https://www.arangodb.com/docs/3.6/transactions-limitations.html#rocksdb-storage-engine
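Reusing db and bindVars from the AQL sketch above, enabling intermediate commits is one extra option on the query (a hedged fragment; the threshold is an illustrative value):

// import com.arangodb.model.AqlQueryOptions;

// enable intermediate commits for large AQL write queries (threshold illustrative)
AqlQueryOptions options = new AqlQueryOptions()
        .intermediateCommitCount(10_000L); // commit after every 10k write operations
db.query("FOR d IN @docs INSERT d INTO @@col", bindVars, options, Void.class);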
(3) Batch Generation of Documents
We also provide a specialized API for inserting batches of documents. This can be used as an alternative to (2) and allows you to gain even more performance. For example, see https://gist.github.com/rashtao/22a43ba5233669d610eca65e06bc7b87
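One multi-document call in the Java driver that sends a whole batch in a single request is ArangoCollection#insertDocuments. As a hedged illustration (this may differ from what Michele's gist uses; names and sizes are assumptions):

import com.arangodb.ArangoDB;
import com.arangodb.entity.BaseDocument;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

public class BatchInsert {

    public static void main(String[] args) {
        ArangoDB arangoDB = new ArangoDB.Builder().host("localhost", 8529).build();

        // build a batch of 500 documents and send it in a single request
        List<BaseDocument> batch = new ArrayList<>();
        for (int i = 0; i < 500; i++) {
            batch.add(new BaseDocument("v-" + UUID.randomUUID()));
        }
        arangoDB.db("mydb").collection("MY_test_vertex_from1").insertDocuments(batch);

        arangoDB.shutdown();
    }
}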
With Michele's version, this gives:
1200000 vertexes
800000 edges
elapsed: 64428 ms
31042 insertions/s
This is slightly faster than using AQL.
(4) Import
There is also a special API for bulk imports. However, this does not support transactions (see https://www.arangodb.com/docs/stable/http/bulk-imports.html). We can provide more details if required.
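As a hedged illustration of the bulk import API (ArangoCollection#importDocuments; names and sizes are assumptions, and note the lack of transaction support mentioned above):

import com.arangodb.ArangoDB;
import com.arangodb.entity.BaseDocument;
import com.arangodb.entity.DocumentImportEntity;

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

public class BulkImport {

    public static void main(String[] args) {
        ArangoDB arangoDB = new ArangoDB.Builder().host("localhost", 8529).build();

        List<BaseDocument> docs = new ArrayList<>();
        for (int i = 0; i < 500; i++) {
            docs.add(new BaseDocument("v-" + UUID.randomUUID()));
        }
        // bulk import runs outside any transaction
        DocumentImportEntity result = arangoDB.db("mydb")
                .collection("MY_test_vertex_from1")
                .importDocuments(docs);
        System.out.println("created: " + result.getCreated());

        arangoDB.shutdown();
    }
}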
(5) Outlook
We are also working on the next version of the Java driver. It will be reactive and non-blocking on the network side, use fewer threads, and auto-tune the parallelism on the client side. This will make approach (1) even easier to implement.
If you have any further questions please do not hesitate to ask. Alternatively, we can set up a call to discuss the various options.
Michele & Frank