# Run a single nginx worker process.
worker_processes 1; #
# Raise the core-file size limit for worker processes.
worker_rlimit_core 10000m; # for producing core file
# Declare a 10m shared-memory dictionary named test_lock_flaw.
lua_shared_dict test_lock_flaw 10m;
# Endpoint that loads the "flaw" Lua module and runs its test_lock_flaw() routine.
location /flaw{
content_by_lua '
local flaw = require"flaw"
flaw.test_lock_flaw()
';
}
# Liveness probe: replies with a fixed message for as long as a worker can still serve requests.
location /is_alive{
content_by_lua '
ngx.say("not dead lock yet :)")
';
}
#
# Load generator: run ApacheBench against /flaw forever
# (10 concurrent clients, 100000 requests per run, 1 second pause between runs).
while true
do
ab -c 10 -n 100000 127.0.0.1/flaw
sleep 1
done
# Load generator: run ApacheBench against the /is_alive probe forever
# (10 concurrent clients, 100000 requests per run, 1 second pause between runs).
while true
do
ab -c 10 -n 100000 127.0.0.1/is_alive
sleep 1
done
-- yunth...@gmail.com 25-Sep-16
--
local _M = { _VERSION = '1.2' }
-- debug flag
local ddd_flag = false
local lib_ffi = require'ffi'
local lib_lock = require"resty.lock"
local lib_C = lib_ffi.C
local ffi = lib_ffi
-- Raise an assertion failure when x is nil; every other value
-- (including false) passes. Returns nothing.
local function anotnil(x)
  local present = (type(x) ~= "nil")
  assert(present)
end
-- Load-time sanity check: both require() calls above must have produced a value.
anotnil(lib_ffi)
anotnil(lib_lock)
-- Build the optional message used by the assertion helpers.
-- Returns the current stack traceback as a string when the debug flag
-- (ddd_flag) is exactly true, and nil otherwise; with a nil message
-- assert() falls back to its default text.
local function traceback_str()
  if ddd_flag ~= true then
    return nil
  end
  -- The former gsub(..., "", "") replaced empty matches with nothing and
  -- therefore returned the traceback unchanged; it has been dropped.
  return tostring(debug.traceback())
end
-- Type predicates. Each one inspects type() of its argument and returns
-- a boolean; none of them raises.

-- true unless x is nil.
local function notnil(x)
  return not (type(x) == "nil")
end

-- true only when x is nil.
local function isnil(x)
  return not (type(x) ~= "nil")
end

-- true when t is a table.
local function typet(t)
  local kind = type(t)
  return kind == "table"
end

-- true when s is a string.
local function types(s)
  local kind = type(s)
  return kind == "string"
end

-- true when n is a number.
local function typen(n)
  local kind = type(n)
  return kind == "number"
end

-- true when f is a function.
local function typef(f)
  local kind = type(f)
  return kind == "function"
end

-- true when c is an FFI cdata object.
local function typecdata(c)
  local kind = type(c)
  return kind == "cdata"
end
-- Asserting counterparts of the type predicates. Each raises via assert()
-- when the check fails, passing traceback_str() as the message, and
-- returns nothing when it succeeds.

-- Raises when x is nil. (Replaces the earlier message-less anotnil.)
local function anotnil(x)
  assert(type(x) ~= "nil", traceback_str())
end

-- Raises unless x is nil.
local function anil(x)
  assert(type(x) == "nil", traceback_str())
end

-- Raises unless t is a table.
local function atypet(t)
  assert(type(t) == "table", traceback_str())
end

-- Raises unless s is a string.
local function atypes(s)
  assert(type(s) == "string", traceback_str())
end

-- Raises unless n is a number.
local function atypen(n)
  assert(type(n) == "number", traceback_str())
end

-- Raises unless f is a function.
local function atypef(f)
  assert(type(f) == "function", traceback_str())
end

-- Raises unless p is an FFI cdata object.
local function acdata(p)
  assert(type(p) == "cdata", traceback_str())
end

-- Raises unless p is an FFI cdata object that does not compare equal to nil.
local function acdata_notnil(p)
  local msg = traceback_str()
  local is_cdata = (type(p) == "cdata")
  assert(is_cdata, msg)
  -- second step: the cdata itself must not compare equal to nil
  assert(p ~= nil, msg)
end
-- Return ffi.sizeof() for the C type named by the string `str`.
-- Asserts that `str` is a string and that the result is a number.
local function csizeof(str)
  atypes(str)
  local nbytes = ffi.sizeof(str)
  atypen(nbytes)
  return nbytes
end
-- Cast `ptr` to the C type named by the string `str` using ffi.cast.
-- Only `str` is validated (must be a string); neither `ptr` nor the
-- result is checked here.
local function ccast(ptr, str)
  atypes(str)
  -- parentheses keep the return to exactly one value, as before
  return (ffi.cast(str, ptr))
end
-- Load-time check: the ngx global must be a table for the code below to work.
atypet(ngx)
-- Store `value` under key `member_name_str` in the shared dict named
-- `shd_name_str`, using the dict's safe_set method.
-- @param shd_name_str    name of the lua_shared_dict zone
-- @param member_name_str key to write
-- @param value           value to store
-- @return true on success; nil plus an error string on failure, which
--         now also covers a shared dict that does not exist (previously
--         that case raised an "index a nil value" error).
local function shd_set_safe(shd_name_str, member_name_str, value)
  local dic = ngx.shared[shd_name_str]
  if dic == nil then
    return nil, "shared dict not found: " .. tostring(shd_name_str)
  end
  local ok, err = dic:safe_set(member_name_str, value)
  if ok ~= true then
    return nil, tostring(err)
  end
  return true
end
-- Try to take a named lock without waiting.
-- @param shd_name  name of the lua_shared_dict backing the lock (string)
-- @param lock_name non-empty string identifying the lock
-- @param expire    value passed as the lock's exptime; defaults to 20
--                  when it is not a number
-- @return the resty.lock object holding the lock, or nil when the lock
--         object could not be created or the lock was not acquired.
--         The lock is NOT released here; the caller decides when (or
--         whether) to call unlock() on the returned object.
local function lock_try(shd_name, lock_name, expire)
  if type(expire) ~= "number" then
    expire = 20
  end
  assert(type(lock_name) == "string")
  assert(#lock_name > 0)
  atypes(shd_name)

  -- timeout = 0 makes lock() behave as a try-lock.
  local lock = lib_lock:new(shd_name, {
    exptime = expire,
    timeout = 0,
    step = 0.001,
    ratio = 2,
    max_step = 0.5,
  })
  if lock == nil then
    -- new() failed (it reports nil plus an error string); previously this
    -- fell through and crashed on lock:lock().
    return nil
  end

  local elapsed = lock:lock(lock_name)
  if elapsed == nil then
    return nil
  end
  -- got the lock :)
  return lock
end
-- Release a lock object by calling its unlock() method.
-- @param lock an object with an unlock() method (e.g. from lock_try)
-- @return whatever unlock() returns, so callers can detect a failed
--         release; callers that ignore the result are unaffected.
local function lock_release(lock)
  return lock:unlock()
end
-- Write `value` under `member` in shared dict `shd` and raise if anything
-- is wrong.
-- @param shd    non-empty shared dict name (string)
-- @param member non-empty key (string)
-- @param value  any non-nil value
-- @return true (the function raises instead of returning a failure)
local function shd_set_safe_with_assert(shd, member, value)
  assert(type(value) ~= "nil")
  assert(type(shd) == "string")
  assert(#shd > 0)
  assert(type(member) == "string")
  assert(#member > 0)
  local ret, err = shd_set_safe(shd, member, value)
  -- include the reason reported by shd_set_safe in the assertion failure
  assert(ret == true, err)
  return ret
end
-- Prerequisite check: create an FFI object whose ctype has a __gc
-- finalizer, drop the only reference, and force a full collection so the
-- finalizer's output can be observed.
-- @param pf optional function taking one string, used for all output;
--           defaults to ngx.say when it is not a function
function _M.test_lock_flaw_prerequisite(pf)
  if not typef(pf) then
    pf = ngx.say
  end

  -- Finalizer attached to the ctype below; it only reports that it ran.
  local function gc_fp(cdata)
    acdata_notnil(cdata)
    -- do gc stuff...
    pf("gc_fp:" .. tostring(cdata))
  end

  local ctype = ffi.metatype(
    "struct {int key;}",
    { __gc = gc_fp }
  )
  anotnil(ctype)
  pf("type(ctype):" .. type(ctype))

  -- Must be a local: the original assigned to an undeclared name and so
  -- created a global variable `cdata1`.
  local cdata1 = ffi.new(ctype)
  acdata_notnil(cdata1)
  cdata1.key = 1
  -- drop the only reference so the collector may finalize the object
  cdata1 = nil
  pf("collectgarbage:(are you see some gc outputs?...☺")
  collectgarbage()
  pf("exit,bye :)")
end
-- Reproduction routine: repeatedly acquires try-locks that are never
-- unlocked while reading from the same shared dict.
-- @param pf optional function taking one string, used for all output;
--           defaults to ngx.say when it is not a function
function _M.test_lock_flaw(pf)
  if not typef(pf) then
    pf = ngx.say
  end

  -- Print a string and, when printing through ngx.say, flush it.
  local function xpf(str)
    atypes(str)
    pf(str)
    if pf == ngx.say then
      ngx.flush()
    end
  end

  xpf("enter:")
  local shd_name = "test_lock_flaw"
  local dic = ngx.shared[shd_name]

  -- key_ct has to be declared before its first use. The original read it
  -- ahead of its `local` declaration, which resolved to a nil global and
  -- wrote "key:nil" instead of the "key:0" entry read in the loops below.
  local key_ct = 0
  local key = "key:" .. tostring(key_ct)
  local value = string.rep("v:" .. tostring(key_ct), 1)
  shd_set_safe_with_assert(shd_name, key, value)

  local lock_ct = 0
  while lock_ct < (10000000) do
    local lock = lock_try(shd_name, "lock_name" .. tostring(math.random()), 1)
    -- Deliberately dropped without unlock(): the lock object is left for
    -- the garbage collector.
    lock = nil
    dic:get("key:0")
    lock_ct = lock_ct + 1
  end

  shd_set_safe_with_assert(shd_name, key, value)

  local loop_ct = 0
  -- NOTE: the bound is re-evaluated (a new random number) on every
  -- iteration; kept as in the original.
  while loop_ct < (math.random(1, 10000) * 100) do
    loop_ct = loop_ct + 1
    dic:get("key:0")
  end
  xpf("exit,bye :)")
end
-- Reproduction routine variant: takes 10000 uniquely named try-locks,
-- keeps them in a table, writes and verifies a larger value, then drops
-- the table (without unlocking) while reading the shared dict in a loop.
-- @param pf optional function taking one string, used for all output;
--           defaults to ngx.say when it is not a function
function _M.test_lock_flaw_rand(pf)
  if not typef(pf) then
    pf = ngx.say
  end

  -- Print a string and, when printing through ngx.say, flush it.
  local function xpf(str)
    atypes(str)
    pf(str)
    if pf == ngx.say then
      ngx.flush()
    end
  end

  xpf("enter:")
  local shd_name = "test_lock_flaw"
  local dic = ngx.shared[shd_name]
  local pid = ngx.pid()

  local lock_ct = 0
  local lock_t = {}
  while lock_ct < 10000 do
    local lock_name = "lock_name:" .. tostring(lock_ct) .. ":" .. tostring(pid)
      .. ":" .. tostring(math.random())
    lock_t[lock_ct] = lock_try(shd_name, lock_name, 1)
    anotnil(lock_t[lock_ct])
    lock_ct = lock_ct + 1
  end

  local key_ct = 0
  local key = "key:" .. tostring(key_ct)
  local expected = string.rep("v:" .. tostring(key_ct), 1000)
  shd_set_safe_with_assert(shd_name, key, expected)
  -- Read the value back directly from the dict: the helper the original
  -- called here (shd_get_safe_with_assert) is not defined in this file.
  local v = dic:get(key)
  assert(v == expected)

  -- Drop every lock object at once, without unlock(). The original did
  -- this assignment on every loop iteration; once is enough.
  lock_t = nil
  local loop_ct = 0
  while loop_ct < 20000000 do
    loop_ct = loop_ct + 1
    dic:get("key:0")
  end
  xpf("exit,bye :)")
end
return _M
# Lua C 接口类型的 shdict API:
ngx_shmtx_lock(&ctx->shpool->mutex);
rc = ngx_http_lua_shdict_lookup(zone, hash, key.data, key.len, &sd);
if (value_type == LUA_TSTRING) {
    lua_pushlstring(L, (char *) value.data, value.len);
    ngx_shmtx_unlock(&ctx->shpool->mutex);
}
一个worker的luajit vm垃圾回收调用了shdict访问操作,接着这个shdict访问操作去获取一把它已经获得的锁,于是,自己死锁。接着,大多数情况下,其他worker的逻辑中也会存在shdict访问(因为shdict本身是用来进行跨worker进程间通信的),于是在接下来一个确定的时间内(取决于程序的固定参数,一般都比较小,秒级别),其他的所有worker一样陷入死锁状态,因为这把锁的拥有者在阻塞地获取这把它已经抢到的锁。
# FFI 接口类型的 shdict API:
ngx_shmtx_lock(&ctx->shpool->mutex);
rc = ngx_http_lua_shdict_lookup(zone, hash, key.data, key.len, &sd);
if (value_type == LUA_TSTRING) {
    # copy value str to user's buffer
    ngx_shmtx_unlock(&ctx->shpool->mutex);
    # return buffer address to user
}
init_by_lua 'require"resty.core"';
collectgarbage("stop")
-- any dict operation
collectgarbage("restart")
凡是使用lua c类型shdict接口的,并且在lua对象的__gc方法中进行了lua shdict操作的,就有一定概率发生worker死锁。
凡是使用了lua-resty-lock库并且没有事先使用lua-resty-core进行api替换的Openresty服务,都会有概率死锁。
init_by_lua 'require"resty.core"';
--
--
邮件来自列表“openresty”,专用于技术讨论!
订阅: 请发空白邮件到 openresty+subscribe@googlegroups.com
发言: 请发邮件到 open...@googlegroups.com
退订: 请发邮件至 openresty+unsubscribe@googlegroups.com
归档: http://groups.google.com/group/openresty
官网: http://openresty.org/
仓库: https://github.com/agentzh/ngx_openresty
教程: http://openresty.org/download/agentzh-nginx-tutorials-zhcn.html
# Run a single nginx worker process.
worker_processes 1; #
worker_rlimit_core 10000m; # for producing core file
# Shared-memory dictionary used by the reproduction script.
# (On the collapsed original line this directive sat behind a '#'.)
lua_shared_dict test_lock_flaw 10m;

# Runs the flaw_simple Lua script. The leading "/" is required for a
# prefix location to match a request URI.
location /flaw_simple {
    content_by_lua 'require "flaw_simple"';
}

# Liveness probe.
location /is_alive {
    content_by_lua 'ngx.say("not dead lock yet :)")';
}
local lib_lock = require"resty.lock" -- 0.04

-- Try to take a named lock without waiting.
-- @param shd_name  name of the lua_shared_dict backing the lock (string)
-- @param lock_name non-empty string identifying the lock
-- @param expire    value passed as the lock's exptime; defaults to 20
--                  when it is not a number
-- @return the resty.lock object holding the lock, or nil when the lock
--         object could not be created or the lock was not acquired.
--         The lock is not released here.
local function lock_try(shd_name, lock_name, expire)
  if type(expire) ~= "number" then
    expire = 20
  end
  assert(type(lock_name) == "string")
  assert(#lock_name > 0)
  assert(type(shd_name) == "string")

  -- timeout = 0 makes lock() behave as a try-lock.
  local lock = lib_lock:new(shd_name, {
    ["exptime"] = expire,
    ["timeout"] = 0,
    ["step"] = 0.001,
    ["ratio"] = 2,
    ["max_step"] = 0.5,
  })
  if lock == nil then
    -- new() failed; avoid indexing nil below
    return nil
  end

  -- try_lock
  local elapsed = lock:lock(lock_name)
  if type(elapsed) ~= "nil" then
    -- got the lock :)
    return lock
  end
  return nil
end
-- Minimal reproduction: acquire try-locks that are never unlocked while
-- reading from the same shared dict.
-- @param pf optional function taking one string, used for all output;
--           defaults to ngx.say when it is not a function
local function test_lock_flaw(pf)
  -- `not type(pf) == "function"` parsed as `(not type(pf)) == "function"`,
  -- which is always false, so the default was never applied.
  if type(pf) ~= "function" then
    pf = ngx.say
  end

  -- Print a string and, when printing through ngx.say, flush it.
  local function xpf(str)
    assert(type(str) == "string")
    pf(str)
    if pf == ngx.say then
      ngx.flush()
    end
  end

  xpf("enter:")
  local shd_name = "test_lock_flaw"
  local lock_ct = 0
  local dic = ngx.shared[shd_name]
  dic:set("key:0", "v:0")
  while lock_ct < (10000000) do
    local lock = lock_try(shd_name, "lock_name" .. tostring(math.random()), 1)
    -- Deliberately dropped without unlock(): the lock object is left for
    -- the garbage collector.
    lock = nil
    dic:get("key:0")
    lock_ct = lock_ct + 1
  end
  xpf("exit,bye :)")
end

test_lock_flaw(ngx.say)
(gdb) bt#0 sem_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S:85#1 0x0000000000427438 in ngx_shmtx_lock (mtx=0x7fba9cd7d058) at src/core/ngx_shmtx.c:111#2 0x00000000004ca8f2 in ngx_http_lua_shdict_set_helper (L=0x41b20050, flags=0) at ../ngx_lua-0.10.5/src/ngx_http_lua_shdict.c:960#3 0x00007fba9fe3536b in lj_BC_FUNCC () from /usr/local/openresty/luajit/lib/libluajit-5.1.so.2#4 0x00007fba9fe37564 in gc_call_finalizer (g=g@entry=0x41b113b8, L=L@entry=0x41b20050, mo=mo@entry=0x7ffebd2f5080,o=o@entry=0x41420dd8) at lj_gc.c:466#5 0x00007fba9fe3774a in gc_finalize (L=L@entry=0x41b20050) at lj_gc.c:500#6 0x00007fba9fe384b3 in gc_onestep (L=L@entry=0x41b20050) at lj_gc.c:650#7 0x00007fba9fe38c6c in lj_gc_step (L=0x41b20050) at lj_gc.c:680#8 0x00007fba9fe46cda in lua_pushlstring (L=L@entry=0x41b20050, str=0x7fba9cd8d0d1 "v:0", len=len@entry=3) at lj_api.c:577#9 0x00000000004c9fce in ngx_http_lua_shdict_get_helper (L=0x41b20050, get_stale=0)at ../ngx_lua-0.10.5/src/ngx_http_lua_shdict.c:508#10 0x00007fba9fe3536b in lj_BC_FUNCC () from /usr/local/openresty/luajit/lib/libluajit-5.1.so.2#11 0x00007fba9fe950dc in lj_cf_package_require (L=0x41b20050) at lib_package.c:439#12 0x00007fba9fe3536b in lj_BC_FUNCC () from /usr/local/openresty/luajit/lib/libluajit-5.1.so.2#13 0x00000000004c2f43 in ngx_http_lua_run_thread (L=L@entry=0x41b11378, r=r@entry=0x7fbaa0862940, ctx=ctx@entry=0x7fbaa0855420,nrets=nrets@entry=0) at ../ngx_lua-0.10.5/src/ngx_http_lua_util.c:1015#14 0x00000000004c6b60 in ngx_http_lua_content_by_chunk (L=0x41b11378, r=0x7fbaa0862940)at ../ngx_lua-0.10.5/src/ngx_http_lua_contentby.c:112#15 0x00000000004c6691 in ngx_http_lua_content_handler (r=0x7fbaa0862940) at ../ngx_lua-0.10.5/src/ngx_http_lua_contentby.c:214#16 0x0000000000450cdf in ngx_http_core_content_phase (r=0x7fbaa0862940, ph=<optimized out>) at src/http/ngx_http_core_module.c:1381#17 0x000000000044b3d5 in ngx_http_core_run_phases (r=r@entry=0x7fbaa0862940) at 
src/http/ngx_http_core_module.c:858#18 0x000000000044b4c4 in ngx_http_handler (r=r@entry=0x7fbaa0862940) at src/http/ngx_http_core_module.c:841#19 0x00000000004570b9 in ngx_http_process_request (r=0x7fbaa0862940) at src/http/ngx_http_request.c:1912#20 0x00000000004576a8 in ngx_http_process_request_headers (rev=rev@entry=0x7fbaa08b82e0) at src/http/ngx_http_request.c:1344#21 0x0000000000457a66 in ngx_http_process_request_line (rev=0x7fbaa08b82e0) at src/http/ngx_http_request.c:1023#22 0x0000000000440e39 in ngx_epoll_process_events (cycle=0x7fbaa0851440, timer=<optimized out>, flags=<optimized out>)at src/event/modules/ngx_epoll_module.c:822#23 0x0000000000437527 in ngx_process_events_and_timers (cycle=cycle@entry=0x7fbaa0851440) at src/event/ngx_event.c:242#24 0x000000000043e805 in ngx_worker_process_cycle (cycle=cycle@entry=0x7fbaa0851440, data=data@entry=0x0)at src/os/unix/ngx_process_cycle.c:753#25 0x000000000043d2d0 in ngx_spawn_process (cycle=cycle@entry=0x7fbaa0851440, proc=proc@entry=0x43e7c0 <ngx_worker_process_cycle>,data=data@entry=0x0, name=name@entry=0x4fd975 "worker process", respawn=respawn@entry=-3) at src/os/unix/ngx_process.c:198#26 0x000000000043ea74 in ngx_start_worker_processes (cycle=cycle@entry=0x7fbaa0851440, n=1, type=type@entry=-3)at src/os/unix/ngx_process_cycle.c:358#27 0x000000000043f7af in ngx_master_process_cycle (cycle=cycle@entry=0x7fbaa0851440) at src/os/unix/ngx_process_cycle.c:130#28 0x000000000041a5d9 in main (argc=<optimized out>, argv=<optimized out>) at src/core/nginx.c:367(gdb) lbtC:ngx_http_lua_shdict_delete@lua/com/resty/lock.lua:66C:ngx_http_lua_shdict_get@lua/flaw_simple.lua:57@lua/flaw_simple.lua:65C:lj_cf_package_require=content_by_lua(nginx.conf:53):2(gdb)
# Run a single nginx worker process.
worker_processes 1; #
worker_rlimit_core 10000m; # for producing core file
# Shared-memory dictionary used by the reproduction script.
# (On the collapsed original line this directive sat behind a '#'.)
lua_shared_dict test_lock_flaw 10m;

# Runs the flaw_ordinary Lua script. The leading "/" is required for a
# prefix location to match a request URI.
location /flaw_ordinary {
    content_by_lua 'require "flaw_ordinary"';
}

# Liveness probe.
location /is_alive {
    content_by_lua 'ngx.say("not dead lock yet :)")';
}
-- Reproduction without resty.lock: FFI objects whose __gc finalizer
-- writes to a shared dict are created and dropped in a loop that also
-- reads from that dict.
-- @param pf optional function taking one string, used for all output;
--           defaults to ngx.say when it is not a function. (The original
--           signature had no parameter, so `pf` was a global even though
--           the call below passes ngx.say.)
local function test_ordinary_flaw(pf)
  local ffi = require"ffi"
  local ffi_new = ffi.new
  if not (type(pf) == "function") then
    pf = ngx.say
  end

  -- Print a string and, when printing through ngx.say, flush it.
  local function xpf(str)
    assert(type(str) == "string")
    pf(str)
    if pf == ngx.say then
      ngx.flush()
    end
  end

  xpf("enter:")
  local shd_name = "test_lock_flaw"
  local dic = ngx.shared[shd_name]

  -- Finalizer that performs a shared-dict write; this is the operation
  -- under test and is intentionally left in place.
  local function gc_fp(cdata)
    -- do gc stuff...
    dic:set("key:0", "v:0")
  end

  local ctype = ffi.metatype(
    "struct {int key;int v;}",
    { __gc = gc_fp }
  )
  xpf("type(ctype):" .. type(ctype))

  local cdata1 = ffi_new(ctype)
  cdata1.key = 1
  cdata1.v = 1
  cdata1 = nil

  dic:set("key:0", "v:0")
  local lock_ct = 0
  while lock_ct < (10000000) do
    -- create an object and immediately drop the only reference to it
    local garbage = ffi_new(ctype)
    garbage.key = 1
    garbage.v = 1
    garbage = nil
    dic:get("key:0")
    lock_ct = lock_ct + 1
  end
  xpf("exit,bye :)")
end

test_ordinary_flaw(ngx.say)
init_by_lua 'require"resty.core"';
作为一般的 OpenResty 编程建议,应当避免在 __gc 方法中进行任何复杂操作,比如操作共享内存字典之类。__gc 中应当只进行 C
级别的资源的释放。进行复杂操作的风险很大,因为可能会在任意的上下文中调用。
Regards,
-agentzh
It is strongly recommended to always call the unlock() method to actively release the lock as soon as possible.
If the unlock() method is never called after this method call, the lock will get released when
- the current resty.lock object instance is collected automatically by the Lua GC.
- the exptime for the lock entry is reached.