求助!线上服务的 nginx 在对客户端提供服务的时候,大约有千分之一的请求会返回 Connection Reset

188 views
Skip to first unread message

Freeman Zhang

unread,
Jul 19, 2013, 6:20:55 AM (7/19/13)
to open...@googlegroups.com
1. 机器配置
    64GB 内存
    8核 16线程的 至强处理器
    运行CentOS 6.3

2. nginx 承担的工作
    主要是提供反向代理工作
    其中有proxy_cache,量大概在600GB
    HIT命中率在60%左右
    平常 QPS 在 200 上下。

运行nginx版本:1.2.0

附:
内核参数:

net.netfilter.nf_log.0 = NONE
net.netfilter.nf_log.1 = NONE
net.netfilter.nf_log.2 = NONE
net.netfilter.nf_log.3 = NONE
net.netfilter.nf_log.4 = NONE
net.netfilter.nf_log.5 = NONE
net.netfilter.nf_log.6 = NONE
net.netfilter.nf_log.7 = NONE
net.netfilter.nf_log.8 = NONE
net.netfilter.nf_log.9 = NONE
net.netfilter.nf_log.10 = NONE
net.netfilter.nf_log.11 = NONE
net.netfilter.nf_log.12 = NONE
net.core.somaxconn = 128
net.core.xfrm_aevent_etime = 10
net.core.xfrm_aevent_rseqth = 2
net.core.xfrm_larval_drop = 1
net.core.xfrm_acq_expires = 30
net.core.wmem_max = 1048576
net.core.rmem_max = 131071
net.core.wmem_default = 229376
net.core.rmem_default = 229376
net.core.dev_weight = 64
net.core.netdev_max_backlog = 1000
net.core.message_cost = 5
net.core.message_burst = 10
net.core.optmem_max = 20480
net.core.rps_sock_flow_entries = 0
net.core.netdev_budget = 300
net.core.warnings = 1
net.ipv4.route.gc_thresh = 524288
net.ipv4.route.max_size = 8388608
net.ipv4.route.gc_min_interval = 0
net.ipv4.route.gc_min_interval_ms = 500
net.ipv4.route.gc_timeout = 300
net.ipv4.route.gc_interval = 60
net.ipv4.route.redirect_load = 20
net.ipv4.route.redirect_number = 9
net.ipv4.route.redirect_silence = 20480
net.ipv4.route.error_cost = 1000
net.ipv4.route.error_burst = 5000
net.ipv4.route.gc_elasticity = 8
net.ipv4.route.mtu_expires = 600
net.ipv4.route.min_pmtu = 552
net.ipv4.route.min_adv_mss = 256
net.ipv4.route.secret_interval = 600
net.ipv4.neigh.default.mcast_solicit = 3
net.ipv4.neigh.default.ucast_solicit = 3
net.ipv4.neigh.default.app_solicit = 0
net.ipv4.neigh.default.retrans_time = 99
net.ipv4.neigh.default.base_reachable_time = 30
net.ipv4.neigh.default.delay_first_probe_time = 5
net.ipv4.neigh.default.gc_stale_time = 60
net.ipv4.neigh.default.unres_qlen = 3
net.ipv4.neigh.default.proxy_qlen = 64
net.ipv4.neigh.default.anycast_delay = 99
net.ipv4.neigh.default.proxy_delay = 79
net.ipv4.neigh.default.locktime = 99
net.ipv4.neigh.default.retrans_time_ms = 1000
net.ipv4.neigh.default.base_reachable_time_ms = 30000
net.ipv4.neigh.default.gc_interval = 30
net.ipv4.neigh.default.gc_thresh1 = 128
net.ipv4.neigh.default.gc_thresh2 = 512
net.ipv4.neigh.default.gc_thresh3 = 1024
net.ipv4.neigh.lo.mcast_solicit = 3
net.ipv4.neigh.lo.ucast_solicit = 3
net.ipv4.neigh.lo.app_solicit = 0
net.ipv4.neigh.lo.retrans_time = 99
net.ipv4.neigh.lo.base_reachable_time = 30
net.ipv4.neigh.lo.delay_first_probe_time = 5
net.ipv4.neigh.lo.gc_stale_time = 60
net.ipv4.neigh.lo.unres_qlen = 3
net.ipv4.neigh.lo.proxy_qlen = 64
net.ipv4.neigh.lo.anycast_delay = 99
net.ipv4.neigh.lo.proxy_delay = 79
net.ipv4.neigh.lo.locktime = 99
net.ipv4.neigh.lo.retrans_time_ms = 1000
net.ipv4.neigh.lo.base_reachable_time_ms = 30000
net.ipv4.neigh.em4.mcast_solicit = 3
net.ipv4.neigh.em4.ucast_solicit = 3
net.ipv4.neigh.em4.app_solicit = 0
net.ipv4.neigh.em4.retrans_time = 99
net.ipv4.neigh.em4.base_reachable_time = 30
net.ipv4.neigh.em4.delay_first_probe_time = 5
net.ipv4.neigh.em4.gc_stale_time = 60
net.ipv4.neigh.em4.unres_qlen = 3
net.ipv4.neigh.em4.proxy_qlen = 64
net.ipv4.neigh.em4.anycast_delay = 99
net.ipv4.neigh.em4.proxy_delay = 79
net.ipv4.neigh.em4.locktime = 99
net.ipv4.neigh.em4.retrans_time_ms = 1000
net.ipv4.neigh.em4.base_reachable_time_ms = 30000
net.ipv4.neigh.em2.mcast_solicit = 3
net.ipv4.neigh.em2.ucast_solicit = 3
net.ipv4.neigh.em2.app_solicit = 0
net.ipv4.neigh.em2.retrans_time = 99
net.ipv4.neigh.em2.base_reachable_time = 30
net.ipv4.neigh.em2.delay_first_probe_time = 5
net.ipv4.neigh.em2.gc_stale_time = 60
net.ipv4.neigh.em2.unres_qlen = 3
net.ipv4.neigh.em2.proxy_qlen = 64
net.ipv4.neigh.em2.anycast_delay = 99
net.ipv4.neigh.em2.proxy_delay = 79
net.ipv4.neigh.em2.locktime = 99
net.ipv4.neigh.em2.retrans_time_ms = 1000
net.ipv4.neigh.em2.base_reachable_time_ms = 30000
net.ipv4.neigh.em3.mcast_solicit = 3
net.ipv4.neigh.em3.ucast_solicit = 3
net.ipv4.neigh.em3.app_solicit = 0
net.ipv4.neigh.em3.retrans_time = 99
net.ipv4.neigh.em3.base_reachable_time = 30
net.ipv4.neigh.em3.delay_first_probe_time = 5
net.ipv4.neigh.em3.gc_stale_time = 60
net.ipv4.neigh.em3.unres_qlen = 3
net.ipv4.neigh.em3.proxy_qlen = 64
net.ipv4.neigh.em3.anycast_delay = 99
net.ipv4.neigh.em3.proxy_delay = 79
net.ipv4.neigh.em3.locktime = 99
net.ipv4.neigh.em3.retrans_time_ms = 1000
net.ipv4.neigh.em3.base_reachable_time_ms = 30000
net.ipv4.neigh.em1.mcast_solicit = 3
net.ipv4.neigh.em1.ucast_solicit = 3
net.ipv4.neigh.em1.app_solicit = 0
net.ipv4.neigh.em1.retrans_time = 99
net.ipv4.neigh.em1.base_reachable_time = 30
net.ipv4.neigh.em1.delay_first_probe_time = 5
net.ipv4.neigh.em1.gc_stale_time = 60
net.ipv4.neigh.em1.unres_qlen = 3
net.ipv4.neigh.em1.proxy_qlen = 64
net.ipv4.neigh.em1.anycast_delay = 99
net.ipv4.neigh.em1.proxy_delay = 79
net.ipv4.neigh.em1.locktime = 99
net.ipv4.neigh.em1.retrans_time_ms = 1000
net.ipv4.neigh.em1.base_reachable_time_ms = 30000
net.ipv4.tcp_timestamps = 1
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
net.ipv4.tcp_retrans_collapse = 1
net.ipv4.ip_default_ttl = 64
net.ipv4.ip_no_pmtu_disc = 0
net.ipv4.ip_nonlocal_bind = 1
net.ipv4.tcp_syn_retries = 5
net.ipv4.tcp_synack_retries = 3
net.ipv4.tcp_max_orphans = 3276800
net.ipv4.tcp_max_tw_buckets = 5000
net.ipv4.ip_dynaddr = 0
net.ipv4.tcp_keepalive_time = 300
net.ipv4.tcp_keepalive_probes = 9
net.ipv4.tcp_keepalive_intvl = 75
net.ipv4.tcp_retries1 = 3
net.ipv4.tcp_retries2 = 15
net.ipv4.tcp_fin_timeout = 15
net.ipv4.tcp_syncookies = 0
net.ipv4.tcp_tw_recycle = 0
net.ipv4.tcp_abort_on_overflow = 0
net.ipv4.tcp_stdurg = 0
net.ipv4.tcp_rfc1337 = 0
net.ipv4.tcp_max_syn_backlog = 65536
net.ipv4.ip_local_port_range = 5000    65000
net.ipv4.ip_local_reserved_ports =
net.ipv4.igmp_max_memberships = 20
net.ipv4.igmp_max_msf = 10
net.ipv4.inet_peer_threshold = 65664
net.ipv4.inet_peer_minttl = 120
net.ipv4.inet_peer_maxttl = 600
net.ipv4.inet_peer_gc_mintime = 10
net.ipv4.inet_peer_gc_maxtime = 120
net.ipv4.tcp_orphan_retries = 0
net.ipv4.tcp_fack = 1
net.ipv4.tcp_reordering = 3
net.ipv4.tcp_ecn = 2
net.ipv4.tcp_dsack = 1
net.ipv4.tcp_mem = 94500000    915000000    927000000
net.ipv4.tcp_wmem = 4096    16384    4194304
net.ipv4.tcp_rmem = 4096    87380    4194304
net.ipv4.tcp_app_win = 31
net.ipv4.tcp_adv_win_scale = 2
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_frto = 2
net.ipv4.tcp_frto_response = 0
net.ipv4.tcp_low_latency = 0
net.ipv4.tcp_no_metrics_save = 0
net.ipv4.tcp_moderate_rcvbuf = 1
net.ipv4.tcp_tso_win_divisor = 3
net.ipv4.tcp_congestion_control = cubic
net.ipv4.tcp_abc = 0
net.ipv4.tcp_mtu_probing = 0
net.ipv4.tcp_base_mss = 512
net.ipv4.tcp_workaround_signed_windows = 0
net.ipv4.tcp_dma_copybreak = 4096
net.ipv4.tcp_slow_start_after_idle = 1
net.ipv4.cipso_cache_enable = 1
net.ipv4.cipso_cache_bucket_size = 10
net.ipv4.cipso_rbm_optfmt = 0
net.ipv4.cipso_rbm_strictvalid = 1
net.ipv4.tcp_available_congestion_control = cubic reno
net.ipv4.tcp_allowed_congestion_control = cubic reno
net.ipv4.tcp_max_ssthresh = 0
net.ipv4.tcp_thin_linear_timeouts = 0
net.ipv4.tcp_thin_dupack = 0
net.ipv4.udp_mem = 6181536    8242048    12363072
net.ipv4.udp_rmem_min = 4096
net.ipv4.udp_wmem_min = 4096
net.ipv4.conf.all.forwarding = 1
net.ipv4.conf.all.mc_forwarding = 0
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.all.secure_redirects = 1
net.ipv4.conf.all.shared_media = 1
net.ipv4.conf.all.rp_filter = 2
net.ipv4.conf.all.send_redirects = 0
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.all.src_valid_mark = 0
net.ipv4.conf.all.proxy_arp = 0
net.ipv4.conf.all.medium_id = 0
net.ipv4.conf.all.bootp_relay = 0
net.ipv4.conf.all.log_martians = 0
net.ipv4.conf.all.tag = 0
net.ipv4.conf.all.arp_filter = 0
net.ipv4.conf.all.arp_announce = 2
net.ipv4.conf.all.arp_ignore = 2
net.ipv4.conf.all.arp_accept = 0
net.ipv4.conf.all.arp_notify = 0
net.ipv4.conf.all.proxy_arp_pvlan = 0
net.ipv4.conf.all.disable_xfrm = 0
net.ipv4.conf.all.disable_policy = 0
net.ipv4.conf.all.force_igmp_version = 0
net.ipv4.conf.all.promote_secondaries = 0
net.ipv4.conf.all.accept_local = 0
net.ipv4.conf.default.forwarding = 1
net.ipv4.conf.default.mc_forwarding = 0
net.ipv4.conf.default.accept_redirects = 1
net.ipv4.conf.default.secure_redirects = 1
net.ipv4.conf.default.shared_media = 1
net.ipv4.conf.default.rp_filter = 2
net.ipv4.conf.default.send_redirects = 1
net.ipv4.conf.default.accept_source_route = 0
net.ipv4.conf.default.src_valid_mark = 0
net.ipv4.conf.default.proxy_arp = 0
net.ipv4.conf.default.medium_id = 0
net.ipv4.conf.default.bootp_relay = 0
net.ipv4.conf.default.log_martians = 0
net.ipv4.conf.default.tag = 0
net.ipv4.conf.default.arp_filter = 0
net.ipv4.conf.default.arp_announce = 0
net.ipv4.conf.default.arp_ignore = 0
net.ipv4.conf.default.arp_accept = 0
net.ipv4.conf.default.arp_notify = 0
net.ipv4.conf.default.proxy_arp_pvlan = 0
net.ipv4.conf.default.disable_xfrm = 0
net.ipv4.conf.default.disable_policy = 0
net.ipv4.conf.default.force_igmp_version = 0
net.ipv4.conf.default.promote_secondaries = 0
net.ipv4.conf.default.accept_local = 0
net.ipv4.conf.lo.forwarding = 1
net.ipv4.conf.lo.mc_forwarding = 0
net.ipv4.conf.lo.accept_redirects = 1
net.ipv4.conf.lo.secure_redirects = 1
net.ipv4.conf.lo.shared_media = 1
net.ipv4.conf.lo.rp_filter = 0
net.ipv4.conf.lo.send_redirects = 1
net.ipv4.conf.lo.accept_source_route = 0
net.ipv4.conf.lo.src_valid_mark = 0
net.ipv4.conf.lo.proxy_arp = 0
net.ipv4.conf.lo.medium_id = 0
net.ipv4.conf.lo.bootp_relay = 0
net.ipv4.conf.lo.log_martians = 0
net.ipv4.conf.lo.tag = 0
net.ipv4.conf.lo.arp_filter = 0
net.ipv4.conf.lo.arp_announce = 0
net.ipv4.conf.lo.arp_ignore = 0
net.ipv4.conf.lo.arp_accept = 0
net.ipv4.conf.lo.arp_notify = 0
net.ipv4.conf.lo.proxy_arp_pvlan = 0
net.ipv4.conf.lo.disable_xfrm = 1
net.ipv4.conf.lo.disable_policy = 1
net.ipv4.conf.lo.force_igmp_version = 0
net.ipv4.conf.lo.promote_secondaries = 0
net.ipv4.conf.lo.accept_local = 0
net.ipv4.conf.em4.forwarding = 1
net.ipv4.conf.em4.mc_forwarding = 0
net.ipv4.conf.em4.accept_redirects = 1
net.ipv4.conf.em4.secure_redirects = 1
net.ipv4.conf.em4.shared_media = 1
net.ipv4.conf.em4.rp_filter = 2
net.ipv4.conf.em4.send_redirects = 1
net.ipv4.conf.em4.accept_source_route = 0
net.ipv4.conf.em4.src_valid_mark = 0
net.ipv4.conf.em4.proxy_arp = 0
net.ipv4.conf.em4.medium_id = 0
net.ipv4.conf.em4.bootp_relay = 0
net.ipv4.conf.em4.log_martians = 0
net.ipv4.conf.em4.tag = 0
net.ipv4.conf.em4.arp_filter = 0
net.ipv4.conf.em4.arp_announce = 0
net.ipv4.conf.em4.arp_ignore = 0
net.ipv4.conf.em4.arp_accept = 0
net.ipv4.conf.em4.arp_notify = 0
net.ipv4.conf.em4.proxy_arp_pvlan = 0
net.ipv4.conf.em4.disable_xfrm = 0
net.ipv4.conf.em4.disable_policy = 0
net.ipv4.conf.em4.force_igmp_version = 0
net.ipv4.conf.em4.promote_secondaries = 0
net.ipv4.conf.em4.accept_local = 0
net.ipv4.conf.em2.forwarding = 1
net.ipv4.conf.em2.mc_forwarding = 0
net.ipv4.conf.em2.accept_redirects = 1
net.ipv4.conf.em2.secure_redirects = 1
net.ipv4.conf.em2.shared_media = 1
net.ipv4.conf.em2.rp_filter = 2
net.ipv4.conf.em2.send_redirects = 1
net.ipv4.conf.em2.accept_source_route = 0
net.ipv4.conf.em2.src_valid_mark = 0
net.ipv4.conf.em2.proxy_arp = 0
net.ipv4.conf.em2.medium_id = 0
net.ipv4.conf.em2.bootp_relay = 0
net.ipv4.conf.em2.log_martians = 0
net.ipv4.conf.em2.tag = 0
net.ipv4.conf.em2.arp_filter = 0
net.ipv4.conf.em2.arp_announce = 0
net.ipv4.conf.em2.arp_ignore = 0
net.ipv4.conf.em2.arp_accept = 0
net.ipv4.conf.em2.arp_notify = 0
net.ipv4.conf.em2.proxy_arp_pvlan = 0
net.ipv4.conf.em2.disable_xfrm = 0
net.ipv4.conf.em2.disable_policy = 0
net.ipv4.conf.em2.force_igmp_version = 0
net.ipv4.conf.em2.promote_secondaries = 0
net.ipv4.conf.em2.accept_local = 0
net.ipv4.conf.em3.forwarding = 1
net.ipv4.conf.em3.mc_forwarding = 0
net.ipv4.conf.em3.accept_redirects = 1
net.ipv4.conf.em3.secure_redirects = 1
net.ipv4.conf.em3.shared_media = 1
net.ipv4.conf.em3.rp_filter = 2
net.ipv4.conf.em3.send_redirects = 1
net.ipv4.conf.em3.accept_source_route = 0
net.ipv4.conf.em3.src_valid_mark = 0
net.ipv4.conf.em3.proxy_arp = 0
net.ipv4.conf.em3.medium_id = 0
net.ipv4.conf.em3.bootp_relay = 0
net.ipv4.conf.em3.log_martians = 0
net.ipv4.conf.em3.tag = 0
net.ipv4.conf.em3.arp_filter = 0
net.ipv4.conf.em3.arp_announce = 0
net.ipv4.conf.em3.arp_ignore = 0
net.ipv4.conf.em3.arp_accept = 0
net.ipv4.conf.em3.arp_notify = 0
net.ipv4.conf.em3.proxy_arp_pvlan = 0
net.ipv4.conf.em3.disable_xfrm = 0
net.ipv4.conf.em3.disable_policy = 0
net.ipv4.conf.em3.force_igmp_version = 0
net.ipv4.conf.em3.promote_secondaries = 0
net.ipv4.conf.em3.accept_local = 0
net.ipv4.conf.em1.forwarding = 1
net.ipv4.conf.em1.mc_forwarding = 0
net.ipv4.conf.em1.accept_redirects = 1
net.ipv4.conf.em1.secure_redirects = 1
net.ipv4.conf.em1.shared_media = 1
net.ipv4.conf.em1.rp_filter = 2
net.ipv4.conf.em1.send_redirects = 1
net.ipv4.conf.em1.accept_source_route = 0
net.ipv4.conf.em1.src_valid_mark = 0
net.ipv4.conf.em1.proxy_arp = 0
net.ipv4.conf.em1.medium_id = 0
net.ipv4.conf.em1.bootp_relay = 0
net.ipv4.conf.em1.log_martians = 0
net.ipv4.conf.em1.tag = 0
net.ipv4.conf.em1.arp_filter = 0
net.ipv4.conf.em1.arp_announce = 0
net.ipv4.conf.em1.arp_ignore = 0
net.ipv4.conf.em1.arp_accept = 0
net.ipv4.conf.em1.arp_notify = 0
net.ipv4.conf.em1.proxy_arp_pvlan = 0
net.ipv4.conf.em1.disable_xfrm = 0
net.ipv4.conf.em1.disable_policy = 0
net.ipv4.conf.em1.force_igmp_version = 0
net.ipv4.conf.em1.promote_secondaries = 0
net.ipv4.conf.em1.accept_local = 0
net.ipv4.ip_forward = 1
net.ipv4.xfrm4_gc_thresh = 4194304
net.ipv4.ipfrag_high_thresh = 262144
net.ipv4.ipfrag_low_thresh = 196608
net.ipv4.ipfrag_time = 30
net.ipv4.icmp_echo_ignore_all = 0
net.ipv4.icmp_echo_ignore_broadcasts = 1
net.ipv4.icmp_ignore_bogus_error_responses = 1
net.ipv4.icmp_errors_use_inbound_ifaddr = 0
net.ipv4.icmp_ratelimit = 300
net.ipv4.icmp_ratemask = 6168
net.ipv4.rt_cache_rebuild_count = 4
net.ipv4.ipfrag_secret_interval = 600
net.ipv4.ipfrag_max_dist = 64

Yichun Zhang (agentzh)

unread,
Jul 19, 2013, 2:56:53 PM (7/19/13)
to openresty
Hello!

On Fri, Jul 19, 2013 at 3:20 AM, Freeman Zhang wrote:
> 1. 机器配置
> 64GB 内存
> 8核 16线程的 志强处理器
> 运行CentOS 6.3
>
> 2. nginx 承担的工作
> 主要是提供反向代理工作
> 其中有proxy_cache,量大概在600GB
> HIT命中率在60%左右
> 平常提供QPS 200上下。
>

你能在 TCP 链路上确定到底是谁发起的 RST 包么?

我听说微软的浏览器喜欢主动发起 RST 包来断开请求,我还听说有一些防火墙性质的东西也喜欢发起 RST 包。

你提供的信息过少,所以很难做出有意义的判断。

Regards,
-agentzh

Yichun Zhang (agentzh)

unread,
Jul 20, 2013, 1:59:51 AM (7/20/13)
to Freeman Zhang, openresty
Hello!

2013/7/19 Freeman Zhang:
> 可以确认是我方服务器发起的connection reset
> 从tcpdump的抓包上看,RST ACK确实是服务器的IP和nginx服务端口。
>

进一步追踪问题我觉得有两个方向,不过都在很低的层面上:

1. 同时记录 RST,ACK 包发送之前当前 TCP 连接上两个方向上往来的包信息。从这些包的内容,结合 TCP 协议的要求,推测引发
kernel TCP 协议栈实现返回 RST,ACK 包的原因。

2. 通过 systemtap 这样的动态追踪工具,直接探入 kernel TCP 协议栈实现中所有发送 RST 包的位置,通过
kernel backtrace 信息(以及 userspace backtrace)确认发起 RST 的代码位置,从而确认原因。

第二个方向更直接一些,运行时代价或许也更低一些,毕竟如你所说,RST 是低频事件。

Regards,
-agentzh

P.S. 同时抄送 openresty 中文邮件列表。

Freeman Zhang

unread,
Jul 20, 2013, 11:43:28 AM (7/20/13)
to Yichun Zhang (agentzh), openresty
直觉上,这个问题很可能跟Time Wait的回收有关。
一个可能性是,回收time wait的时候kernel主动发送了rst ack,但我没有找到有关文档描述这个事情。

Simon

unread,
Jul 21, 2013, 9:32:21 PM (7/21/13)
to open...@googlegroups.com
会不会是监听队列溢出了?nginx 的进程有多少个,IO 繁忙吗?
Reply all
Reply to author
Forward
0 new messages