Triggering a batchjob like process on vert.x JVM for 10 minutes and I observe a lot of Timeout errors. Also I have captured the couchbase tracer logs.
Connection is from a process running on laptop to a remote couchbase server. There are no other network equipment like firewall etc. in middle.
OPR_TOUT below is 3 seconds and I keep getting timeouts in random.
code snippet ->. I am NOT SURE the code is correct. Could u help validate its ok?
  // called in a loop for 1 user at a time for 300 users by a Fetch_Batch
  JsonDocument doc = cbBucketAccess.get(docKey).toSingle().toBlocking().value();
CBBucketAccess.java has ->
public Observable<JsonDocument> get(String documentKey) {
 Â
log.info("InsideGet id="+documentKey);
  return Observable.create(subscriber -> {
    try {
      getFromPrimary(subscriber, documentKey);
    } catch (Exception ex) {
      subscriber.onError(ex);
    }
  }, Emitter.BackpressureMode.NONE);
}
private void getFromPrimary(Emitter<JsonDocument> subscriber, String documentKey) {
 Â
log.info("InsideGetFromPrimary id="+documentKey);
  Stopwatch stopwatch = Stopwatch.createStarted();
  Span span = TracerUtils.defaultTracer().activeSpan();
  pBucket.get(documentKey)
      .timeout(pBucketConfig.getLong(OPR_TOUT), TimeUnit.MILLISECONDS)
      .onErrorResumeNext(exception -> {
            try (Scope ignored1 = TracerUtils.defaultTracer().activateSpan(span)) {
              log.error(" onErrorResumeNext-exception id="+documentKey, exception );
              if(null != exception.getCause()){
                log.error(" onErrorResumeNext-exception-cause id="+documentKey, exception.getCause() );
              }
              return getFromReplica(documentKey);
            }
          }
      )
      .singleOrDefault(null)
      .subscribe(new TracingAction1<>(jsonDocument -> {
       Â
log.info(" InsideGetFromPrimary-SuccessResponseTime=" + Long.toString(stopwatch.elapsed(TimeUnit.MILLISECONDS)));
        if(null != jsonDocument){
          subscriber.onNext(jsonDocument);
        }
        subscriber.onCompleted();
      }, TracerUtils.defaultTracer()), new TracingAction1<>(throwable -> {
       Â
log.info(" InsideGetFromPrimary-FailureResponseTime=" + Long.toString(stopwatch.elapsed(TimeUnit.MILLISECONDS)));
        subscriber.onError(throwable);
      }, TracerUtils.defaultTracer()));
}
private Observable<JsonDocument> getFromReplica(String documentKey) {
 Â
log.info("InsideGetFromReplica id="+documentKey);
  Span span = TracerUtils.defaultTracer().activeSpan();
  return pBucket.getFromReplica(documentKey, ReplicaMode.FIRST)
      .timeout(pBucketConfig.getLong(OPR_TOUT), TimeUnit.MILLISECONDS)
      .onErrorResumeNext(exception -> {
        log.error("getFromReplica: onErrorResumeNext id="+documentKey, exception);
        return Observable.error(exception);
      });
}
here is a part of the log file ->
"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"getFromPrimary","level":"INFO","message":"InsideGetFromPrimary id=wm.UsrAcc::1528__10227364","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879058405,"logId":"5070ca61f41c38011c20177c2b01fe5000000002","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":754,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:18.405Z"}
19:44:21.408Â [Â Â Â RxComputationScheduler-2] [ERROR] - [Â Â Â Â Â Â Â Â Â Â Â Â com.X.awm.common.data.BucketAccess:] -- [] --Â onErrorResumeNext-exception id=wm.UsrAcc::1528__10227364
java.util.concurrent.TimeoutException: null
at rx.internal.operators.OnSubscribeTimeoutTimedWithFallback$TimeoutMainSubscriber.onTimeout(OnSubscribeTimeoutTimedWithFallback.java:166) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.operators.OnSubscribeTimeoutTimedWithFallback$TimeoutMainSubscriber$TimeoutTask.call(OnSubscribeTimeoutTimedWithFallback.java:191) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.schedulers.EventLoopsScheduler$EventLoopWorker$2.call(EventLoopsScheduler.java:189) [rxjava-1.3.8.jar:1.3.8]
at rx.internal.schedulers.ScheduledAction.run(ScheduledAction.java:55) [rxjava-1.3.8.jar:1.3.8]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_181]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_181]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) [?:1.8.0_181]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) [?:1.8.0_181]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_181]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_181]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_181]
19:44:21.408Â [Â Â Â RxComputationScheduler-2] [INFO ] - [Â Â Â Â Â Â Â Â Â Â Â Â com.X.awm.common.data.BucketAccess:] -- [] -- InsideGetFromReplica id=wm.UsrAcc::1528__10227364
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"getFromReplica","level":"INFO","message":"InsideGetFromReplica id=wm.UsrAcc::1528__10227364","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061408,"logId":"5070ca61f41c38011c20177c2b02ba0000000001","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":789,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.408Z"}
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"lambda$getFromPrimary$45","level":"INFO","message":" InsideGetFromPrimary-SuccessResponseTime=3041","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061447,"logId":"5070ca61f41c38011c20177c2b02bc7000000000","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":777,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.447Z"}
19:44:21.447Â [Â Â Â Â Â Â Â cb-computations-7] [INFO ] - [Â Â Â Â Â Â Â Â Â Â Â Â com.X.awm.common.data.BucketAccess:] -- [] --Â InsideGetFromPrimary-SuccessResponseTime=3041
19:44:21.448Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- updtUserAcctLastSnapshotTs success for user 1528__10227364
19:44:21.448Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch callDBOPExecModule consumerNumber=1528__10227364 time-elapsed=3043 sleep=-2043
19:44:21.448Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch callDBOPExecModule end consumerNumber=1528__10227364
19:44:21.449Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData End : failure count 1
19:44:21.449Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData Start : failure count {}1
19:44:21.449Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â com.X.awm.internal.service.dbop.savings.FetchJobService:] -- [] -- dbop.19feb.e0perf-1: Fetch_Batch : loadAndProcessBatchData loadPageData Start
19:44:21.449 [       cb-computations-2] [INFO ] - [       com.X.awm.common.data.UserAcctInfoDataAccess:] -- [] -- Reading savings accounts from DB: paginated for offset 0 and limit 30
19:44:21.450 [       cb-computations-2] [INFO ] - [       com.X.awm.common.data.UserAcctInfoDataAccess:] -- [] -- DB Query :: SimpleN1qlQuery{statement=select * from `WLTHMGT` AS b UNNEST b.payoutAccounts AS pa WHERE b.type = 'wm.UsrAcc' AND b.createdTs < '2021-02-20T00:00:00.000Z' AND pa.tenantCd = 'SAV' ORDER BY pa.lastSnapshotTs OFFSET 0 LIMIT 30}
{"traceId":"2d2182904426c76177c2ad84d3000000","logType":"ELF","fileName":"com.X.awm.common.data.BucketAccess","schemaVersion":"1.0.0","method":"queryFromPrimary","level":"INFO","message":"InsideQueryFromPrimary","type":"log","tags":{"baggage.Request-Id":"dbop.19feb.e0perf-1"},"spanId":"177c2ad84d3de500","environment":{"name":"e0","host":"XXX10S9HTD6","cluster":null},"application":{"name":"wminternal","namespace":"axp-wm-events-processing","version":null,"carId":600000579},"timestampInMillis":1613879061450,"logId":"5070ca61f41c38011c20177c2b02bca000000001","loggerName":"com.X.awm.common.data.BucketAccess","lineNumber":1345,"operation":"dbop consumer","timestamp":"2021-02-21T03:44:21.450Z"}
19:44:21.450Â [Â Â Â Â Â Â Â cb-computations-2] [INFO ] - [Â Â Â Â Â Â Â Â Â Â Â Â com.X.awm.common.data.BucketAccess:] -- [] -- InsideQueryFromPrimary
19:44:22.224Â [Â Â Â Â Â Â Â Â Â cb-tracing-1] [WARN ] - [Â Â Â com.couchbase.client.core.tracing.ThresholdLogReporter:] -- [] -- Operations over threshold: [ {
 "top" : [ {
  "operation_name" : "get_replica",
  "last_local_id" : "1E5598A52D593EF4/FFFFFFFFEAA91BDC",
  "last_local_address" : "10.xx.xx.xxx:56564",
  "last_remote_address" : "10.x.xxx.174:11210",
  "last_dispatch_us" : 37062,
  "decode_us" : 94,
  "last_operation_id" : "0x425",
  "total_us" : 38393
 } ],
 "service" : "kv",
 "count" : 1
} ]