Triggering a batchjob like process on vert.x JVM for 10 minutes and I observe a lot of Timeout errors. Also I have captured the couchbase tracer logs.
Connection is from a process running on laptop to a remote couchbase server. There are no other network equipment like firewall etc. in middle.
OPR_TOUT below is 3 seconds and I keep getting timeouts in random.
code snippet ->. I am NOT SURE the code is correct. Could u help validate its ok?
// called in a loop for 1 user at a time for 300 users by a Fetch_Batch
JsonDocument doc = cbBucketAccess.get(docKey).toSingle().toBlocking().value();
CBBucketAccess.java has ->
public Observable<JsonDocument> get(String documentKey) {
return Observable.create(subscriber -> {
try {
getFromPrimary(subscriber, documentKey);
} catch (Exception ex) {
subscriber.onError(ex);
}
}, Emitter.BackpressureMode.NONE);
}
private void getFromPrimary(Emitter<JsonDocument> subscriber, String documentKey) {
log.info("InsideGetFromPrimary id="+documentKey);
Stopwatch stopwatch = Stopwatch.createStarted();
Span span = TracerUtils.defaultTracer().activeSpan();
pBucket.get(documentKey)
.timeout(pBucketConfig.getLong(OPR_TOUT), TimeUnit.MILLISECONDS)
.onErrorResumeNext(exception -> {
try (Scope ignored1 = TracerUtils.defaultTracer().activateSpan(span)) {
log.error(" onErrorResumeNext-exception id="+documentKey, exception );
if(null != exception.getCause()){
log.error(" onErrorResumeNext-exception-cause id="+documentKey, exception.getCause() );
}
return getFromReplica(documentKey);
}
}
)
.singleOrDefault(null)
.subscribe(new TracingAction1<>(jsonDocument -> {
log.info(" InsideGetFromPrimary-SuccessResponseTime=" + Long.toString(stopwatch.elapsed(TimeUnit.MILLISECONDS)));
if(null != jsonDocument){
subscriber.onNext(jsonDocument);
}
subscriber.onCompleted();
}, TracerUtils.defaultTracer()), new TracingAction1<>(throwable -> {
log.info(" InsideGetFromPrimary-FailureResponseTime=" + Long.toString(stopwatch.elapsed(TimeUnit.MILLISECONDS)));
subscriber.onError(throwable);
}, TracerUtils.defaultTracer()));
}
private Observable<JsonDocument> getFromReplica(String documentKey) {
log.info("InsideGetFromReplica id="+documentKey);
Span span = TracerUtils.defaultTracer().activeSpan();
return pBucket.getFromReplica(documentKey, ReplicaMode.FIRST)
.timeout(pBucketConfig.getLong(OPR_TOUT), TimeUnit.MILLISECONDS)
.onErrorResumeNext(exception -> {
log.error("getFromReplica: onErrorResumeNext id="+documentKey, exception);
return Observable.error(exception);
});
}
here is a part of the log file ->
"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"getFromPrimary","level":"INFO","message":"InsideGetFromPrimary id=wm.UsrAcc::1528__10227364","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879058405,"logId":"5070ca61f41c38011c20177c2b01fe5000000002","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":754,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:18.405Z"}
19:44:21.408 [ RxComputationScheduler-2] [ERROR] - [ com.X.awm.common.data.BucketAccess:] -- [] -- onErrorResumeNext-exception id=wm.UsrAcc::1528__10227364
java.util.concurrent.TimeoutException: null
at rx.internal.operators.OnSubscribeTimeoutTimedWithFallback$TimeoutMainSubscriber.onTimeout(OnSubscribeTimeoutTimedWithFallback.java:166) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.operators.OnSubscribeTimeoutTimedWithFallback$TimeoutMainSubscriber$TimeoutTask.call(OnSubscribeTimeoutTimedWithFallback.java:191) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.schedulers.EventLoopsScheduler$EventLoopWorker$2.call(EventLoopsScheduler.java:189) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.schedulers.ScheduledAction.run(ScheduledAction.java:55) [rxjava-1.3.8.jar:1.3.8]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_181]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_181]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) [?:1.8.0_181]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) [?:1.8.0_181]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_181]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_181]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_181]
19:44:21.408 [ RxComputationScheduler-2] [INFO ] - [ com.X.awm.common.data.BucketAccess:] -- [] -- InsideGetFromReplica id=wm.UsrAcc::1528__10227364
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"getFromReplica","level":"INFO","message":"InsideGetFromReplica id=wm.UsrAcc::1528__10227364","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061408,"logId":"5070ca61f41c38011c20177c2b02ba0000000001","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":789,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.408Z"}
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"lambda$getFromPrimary$45","level":"INFO","message":" InsideGetFromPrimary-SuccessResponseTime=3041","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061447,"logId":"5070ca61f41c38011c20177c2b02bc7000000000","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":777,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.447Z"}
19:44:21.447 [ cb-computations-7] [INFO ] - [ com.X.awm.common.data.BucketAccess:] -- [] -- InsideGetFromPrimary-SuccessResponseTime=3041
19:44:21.448 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- updtUserAcctLastSnapshotTs success for user 1528__10227364
19:44:21.448 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch callDBOPExecModule consumerNumber=1528__10227364 time-elapsed=3043 sleep=-2043
19:44:21.448 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch callDBOPExecModule end consumerNumber=1528__10227364
19:44:21.449 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData End : failure count 1
19:44:21.449 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData Start : failure count {}1
19:44:21.449 [ cb-computations-2] [INFO ] - [ com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData loadPageData Start
19:44:21.449 [ cb-computations-2] [INFO ] - [ com.X.awm.common.data.UserAcctInfoDataAccess:] -- [] -- Reading savings accounts from DB: paginated for offset 0 and limit 30
19:44:21.450 [ cb-computations-2] [INFO ] - [ com.X.awm.common.data.UserAcctInfoDataAccess:] -- [] -- DB Query :: SimpleN1qlQuery{statement=select * from `WLTHMGT` AS b UNNEST b.payoutAccounts AS pa WHERE b.type = 'wm.UsrAcc' AND b.createdTs < '2021-02-20T00:00:00.000Z' AND pa.tenantCd = 'SAV' ORDER BY pa.lastSnapshotTs OFFSET 0 LIMIT 30}
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"queryFromPrimary","level":"INFO","message":"InsideQueryFromPrimary","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061450,"logId":"5070ca61f41c38011c20177c2b02bca000000001","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":1345,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.450Z"}
19:44:21.450 [ cb-computations-2] [INFO ] - [ com.X.awm.common.data.BucketAccess:] -- [] -- InsideQueryFromPrimary
19:44:22.224 [ cb-tracing-1] [WARN ] - [ com.couchbase.client.core.tracing.ThresholdLogReporter:] -- [] -- Operations over threshold: [ {
"top" : [ {
"operation_name" : "get_replica",
"last_local_id" : "1E5598A52D593EF4/FFFFFFFFEAA91BDC",
"last_local_address" : "10.xx.xx.xxx:56564",
"last_remote_address" : "10.x.xxx.174:11210",
"last_dispatch_us" : 37062,
"decode_us" : 94,
"last_operation_id" : "0x425",
"total_us" : 38393
} ],
"service" : "kv",
"count" : 1
} ]