Prometheus metrics disappear after pods are restarted by K8s health probe

442 views
Skip to first unread message

Kenny Tung

unread,
Oct 13, 2023, 5:17:39 AM (10/13/23)
to Prometheus Users
Any idea how to fix it?

ts=2023-10-13T08:38:14.786Z caller=main.go:583 level=info msg="Starting Prometheus Server" mode=server version="(version=2.47.1, branch=HEAD, revision=c4d1a8beff37cc004f1dc4ab9d2e73193f51aaeb)"
ts=2023-10-13T08:38:14.786Z caller=main.go:588 level=info build_context="(go=go1.21.1, platform=linux/amd64, user=root@4829330363be, date=20231004-10:31:16, tags=netgo,builtinassets,stringlabels)"
ts=2023-10-13T08:38:14.786Z caller=main.go:589 level=info host_details="(Linux 5.15.0-1041-azure #48-Ubuntu SMP Tue Jun 20 20:34:08 UTC 2023 x86_64 prometheus-5765644cf4-dczhv (none))"
ts=2023-10-13T08:38:14.786Z caller=main.go:590 level=info fd_limits="(soft=1048576, hard=1048576)"
ts=2023-10-13T08:38:14.786Z caller=main.go:591 level=info vm_limits="(soft=unlimited, hard=unlimited)"
ts=2023-10-13T08:38:14.825Z caller=web.go:566 level=info component=web msg="Start listening for connections" address=0.0.0.0:9290
ts=2023-10-13T08:38:14.826Z caller=main.go:1024 level=info msg="Starting TSDB ..."
ts=2023-10-13T08:38:14.827Z caller=tls_config.go:274 level=info component=web msg="Listening on" address=[::]:9290
ts=2023-10-13T08:38:14.827Z caller=tls_config.go:310 level=info component=web msg="TLS is enabled." http2=true address=[::]:9290

ts=2023-10-13T08:38:14.940Z caller=db.go:930 level=info component=tsdb msg="Found and deleted tmp block dir" dir=data/01HCM2ZA20JWZ3SSB4MCY7KWJB.tmp-for-creation
ts=2023-10-13T08:38:14.945Z caller=dir_locker.go:77 level=warn component=tsdb msg="A lockfile from a previous execution already existed. It was replaced" file=/prometheus/data/lock
ts=2023-10-13T08:38:14.985Z caller=head.go:600 level=info component=tsdb msg="Replaying on-disk memory mappable chunks if any"
ts=2023-10-13T08:38:14.986Z caller=head.go:666 level=error component=tsdb msg="Loading on-disk chunks failed" err="iterate on on-disk chunks: corruption in head chunk file data/chunks_head/000001: head chunk file has some unread data, but doesn't include enough bytes to read the chunk header - required:131078, available:131072, file:1"
ts=2023-10-13T08:38:14.986Z caller=head.go:906 level=info component=tsdb msg="Deleting mmapped chunk files"
ts=2023-10-13T08:38:14.988Z caller=head.go:912 level=info component=tsdb msg="Deleting mmapped chunk files"
ts=2023-10-13T08:38:15.000Z caller=head.go:922 level=info component=tsdb msg="Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks"
ts=2023-10-13T08:38:15.000Z caller=head.go:681 level=info component=tsdb msg="On-disk memory mappable chunks replay completed" duration=14.995821ms
ts=2023-10-13T08:38:15.000Z caller=head.go:689 level=info component=tsdb msg="Replaying WAL, this may take a while"
ts=2023-10-13T08:38:15.251Z caller=db.go:898 level=warn component=tsdb msg="Encountered WAL read error, attempting repair" err="read records: corruption in segment data/wal/00000000 at 1212416: unexpected non-zero byte in padded page"
ts=2023-10-13T08:38:15.252Z caller=wlog.go:426 level=warn component=tsdb msg="Starting corruption repair" segment=0 offset=1212416
ts=2023-10-13T08:38:15.259Z caller=wlog.go:434 level=warn component=tsdb msg="Deleting all segments newer than corrupted segment" segment=0
ts=2023-10-13T08:38:15.279Z caller=wlog.go:456 level=warn component=tsdb msg="Rewrite corrupted segment" segment=0
ts=2023-10-13T08:38:23.386Z caller=db.go:902 level=info component=tsdb msg="Successfully repaired WAL"
ts=2023-10-13T08:38:23.391Z caller=main.go:1045 level=info fs_type=fe534d42
ts=2023-10-13T08:38:23.391Z caller=main.go:1048 level=info msg="TSDB started"


ts=2023-10-13T08:38:43.281Z caller=compact.go:562 level=error component=tsdb msg="removed tmp folder after failed compaction" err="unlinkat data/01HCM306H4R5JVAP85K42A11C7.tmp-for-creation: directory not empty"
ts=2023-10-13T08:38:43.281Z caller=db.go:984 level=error component=tsdb msg="compaction failed" err="compact head: persist head block: 2 errors: populate block: add series: read symbols: invalid checksum; read symbols: invalid checksum"
ts=2023-10-13T08:38:44.734Z caller=compact.go:562 level=error component=tsdb msg="removed tmp folder after failed compaction" err="unlinkat data/01HCM307SSQPBMGTYZ4VK7D117.tmp-for-creation: directory not empty"
ts=2023-10-13T08:38:44.734Z caller=db.go:984 level=error component=tsdb msg="compaction failed" err="compact head: persist head block: 2 errors: populate block: add series: read symbols: invalid checksum; read symbols: invalid checksum"
ts=2023-10-13T08:38:47.040Z caller=compact.go:562 level=error component=tsdb msg="removed tmp folder after failed compaction" err="unlinkat data/01HCM30A6F8GEXNYT4HJC8TN88.tmp-for-creation: directory not empty"
ts=2023-10-13T08:38:47.040Z caller=db.go:984 level=error component=tsdb msg="compaction failed" err="compact head: persist head block: 2 errors: populate block: add series: read symbols: invalid checksum; read symbols: invalid checksum"
ts=2023-10-13T08:38:51.483Z caller=compact.go:562 level=error component=tsdb msg="removed tmp folder after failed compaction" err="unlinkat data/01HCM30ED1P7AKQJ7M6A9D5GR2.tmp-for-creation: directory not empty"
ts=2023-10-13T08:38:51.483Z caller=db.go:984 level=error component=tsdb msg="compaction failed" err="compact head: persist head block: 2 errors: populate block: add series: read symbols: invalid checksum; read symbols: invalid checksum"
ts=2023-10-13T08:38:59.826Z caller=compact.go:562 level=error component=tsdb msg="removed tmp folder after failed compaction" err="unlinkat data/01HCM30PMVZ1AJJ03HQYYNX6RJ.tmp-for-creation: directory not empty"
ts=2023-10-13T08:38:59.827Z caller=db.go:984 level=error component=tsdb msg="compaction failed" err="compact head: persist head block: 2 errors: populate block: add series: read symbols: invalid checksum; read symbols: invalid checksum"
ts=2023-10-13T08:39:13.910Z caller=stdlib.go:105 level=error component=web caller="http: TLS handshake error from 10.104.60.63:46062" msg=EOF

# kubectl get pods -n grafana
NAME                           READY   STATUS    RESTARTS      AGE
grafana-5d5c88677b-wvkhm       1/1     Running   0             75m
influxdb-df6b77f54-hzzs9       1/1     Running   0             29d
prometheus-5765644cf4-dczhv    1/1     Running   1 (11m ago)   166m


$ ls -tlr
total 20
-rwxrwxrwx 1 root root     0 Oct 13 14:02 lock
drwxrwxrwx 2 root root     0 Oct 13 14:02 wal
drwxrwxrwx 2 root root     0 Oct 13 14:02 chunks_head
-rwxrwxrwx 1 root root 20001 Oct 13 16:38 queries.active
drwxrwxrwx 2 root root     0 Oct 13 16:38 01HCM306H4R5JVAP85K42A11C7.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:38 01HCM307SSQPBMGTYZ4VK7D117.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:38 01HCM30A6F8GEXNYT4HJC8TN88.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:38 01HCM30ED1P7AKQJ7M6A9D5GR2.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:38 01HCM30PMVZ1AJJ03HQYYNX6RJ.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:39 01HCM316KKGX4ZC2SJV2DPD4YQ.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:39 01HCM3266SHYDKQEYR9BH3K2NK.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:40 01HCM3415EK8FCQ5DW3SFVE6HE.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:41 01HCM35W4GP5R0GBDYSMQM3VP0.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:42 01HCM37Q2W4SY54V7Y398718TJ.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:43 01HCM39J1BW8QEGV3W2M89YGSK.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:44 01HCM3BCZEJ9B56XYJZJTME4PK.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:45 01HCM3D7Y3E7AY4Z9XF6HT4WVZ.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:46 01HCM3F2YB9B8C6DGV7T9VXGED.tmp-for-creation
drwxrwxrwx 2 root root     0 Oct 13 16:47 01HCM3GXZ1F13VVZQKCHM696EA.tmp-for-creation
Reply all
Reply to author
Forward
0 new messages