"COPY sub_table FROM ...." will trigger "vacuum()->analyze_rel()" on the parent of the partition table. So if we process different subtables for the same partition table at the same time in parallel, it may hang forever.
I wonder is there any solution to fix this problem? Or any GUC setting to workaround? Many thanks in advance!
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
1. The table "sales" is a partition table which has some sub tables:
2. When running "gpcopy --parallelize-leaf-partitions=true", it will process 2 sub tables at the same time, the log is:
3. Then the ps has belows output:
4. The related bt of the process is:
(1) One hanging query
gpadmin 66665 87875 0 14:37 ? 00:00:00 postgres: 6000, gpadmin testdb 192.168.2.11(62826) con6925 cmd16 COPY
(gdb) bt
#0 0x00007f121a55ea3d in poll () from /lib64/libc.so.6
#1 0x0000000000b88524 in checkDispatchResult (ds=0x1ee03b8, wait=1 '\001') at cdbdisp_async.c:486
#2 0x0000000000b8817b in cdbdisp_checkDispatchResult_async (ds=0x1ee03b8, waitMode=DISPATCH_WAIT_NONE) at cdbdisp_async.c:333
#3 0x0000000000b87007 in cdbdisp_checkDispatchResult (ds=0x1ee03b8, waitMode=DISPATCH_WAIT_NONE) at cdbdisp.c:108
#4 0x0000000000b8c028 in cdbdisp_dispatchCommandInternal (pQueryParms=0x1f4a808, flags=4, segments=0x1f5b8a0, cdb_pgresults=0x7ffd6d9b6e80)
at cdbdisp_query.c:446
#5 0x0000000000b8be59 in CdbDispatchCommandToSegments (
strCommand=0x1f5b8d8 "select * from pg_catalog.gp_acquire_sample_rows(97607, 10000, 't') as (totalrows pg_catalog.float8, totaldeadrows pg_catalog.float8, oversized_cols_bitmap pg_catalog.text, id integer, date date, amt n"..., flags=4, segments=0x1f5b8a0, cdb_pgresults=0x7ffd6d9b6e80) at cdbdisp_query.c:366
#6 0x0000000000b8bda8 in CdbDispatchCommand (
strCommand=0x1f5b8d8 "select * from pg_catalog.gp_acquire_sample_rows(97607, 10000, 't') as (totalrows pg_catalog.float8, totaldeadrows pg_catalog.float8, oversized_cols_bitmap pg_catalog.text, id integer, date date, amt n"..., flags=4, cdb_pgresults=0x7ffd6d9b6e80) at cdbdisp_query.c:338
#7 0x0000000000672e1e in acquire_sample_rows_dispatcher (onerel=0x7f121c2087c8, inh=1 '\001', elevel=13, rows=0x1f7fbf8, targrows=30000,
totalrows=0x7ffd6d9b7168, totaldeadrows=0x7ffd6d9b7160) at analyze.c:2442
#8 0x0000000000671f8a in acquire_inherited_sample_rows (onerel=0x7f121c2087c8, elevel=13, rows=0x1f7fbf8, targrows=30000, totalrows=0x7ffd6d9b7168,
totaldeadrows=0x7ffd6d9b7160) at analyze.c:2014
#9 0x000000000066f49f in do_analyze_rel (onerel=0x7f121c2087c8, vacstmt=0x1f560c0, acquirefunc=0x6716ff <acquire_sample_rows>, relpages=0, inh=1 '\001',
in_outer_xact=1 '\001', elevel=13) at analyze.c:679
#10 0x000000000066eade in analyze_rel_internal (relid=97607, vacstmt=0x1f560c0, in_outer_xact=1 '\001', bstrategy=0x1f37308) at analyze.c:405
#11 0x000000000066e48e in
analyze_rel (relid=
97607, vacstmt=0x1f560c0, in_outer_xact=1 '\001', bstrategy=0x1f37308) at analyze.c:223
#12 0x000000000071a144 in vacuum (vacstmt=0x1f560c0, relid=0, do_toast=0 '\000', bstrategy=0x1f37308, for_wraparound=0 '\000', isTopLevel=0 '\000')
at vacuum.c:387
#13 0x00000000008e07b9 in
autostats_issue_analyze (relationOid=
97615) at autostats.c:74
#14 0x00000000008e0d3b in auto_stats (cmdType=AUTOSTATS_CMDTYPE_COPY, relationOid=97615, ntuples=1, inFunction=0 '\000') at autostats.c:324
#15 0x0000000000683f37 in DoCopy (stmt=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_feb17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id 9659f97f-03d9-4e1c-b6d4-68d47662f058' ON SEGMENT CSV", processed=0x7ffd6d9b7890) at copy.c:1173
#16 0x000000000096dd01 in standard_ProcessUtility (parsetree=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_feb17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id 9659f97f-03d9-4e1c-b6d4-68d47662f058' ON SEGMENT CSV", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "") at utility.c:634
#17 0x000000000096d677 in ProcessUtility (parsetree=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_feb17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id 9659f97f-03d9-4e1c-b6d4-68d47662f058' ON SEGMENT CSV", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "") at utility.c:373
#18 0x000000000096c44a in PortalRunUtility (portal=0x1e3c238, utilityStmt=0x1e35540, isTopLevel=1 '\001', dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "")
at pquery.c:1379
#19 0x000000000096c6db in PortalRunMulti (portal=0x1e3c238, isTopLevel=1 '\001', dest=0x1e35928, altdest=0x1e35928, completionTag=0x7ffd6d9b7be0 "")
at pquery.c:1510
#20 0x000000000096bbd2 in PortalRun (portal=0x1e3c238, count=9223372036854775807, isTopLevel=1 '\001', dest=0x1e35928, altdest=0x1e35928,
completionTag=0x7ffd6d9b7be0 "") at pquery.c:1016
#21 0x00000000009640d2 in exec_simple_query (
query_string=0x1e34678 "COPY \"public\".\"sales_1_prt_feb17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id 9659f97f-03d9-4e1c-b6d4-68d47662f058' ON SEGMENT CSV") at postgres.c:1798
#22 0x0000000000968e8c in PostgresMain (argc=1, argv=0x1e14228, dbname=0x1e13fd0 "testdb", username=0x1e14168 "gpadmin") at postgres.c:5215
#23 0x00000000008d7c40 in BackendRun (port=0x1e3c040) at postmaster.c:4803
#24 0x00000000008d72e0 in BackendStartup (port=0x1e3c040) at postmaster.c:4460
#25 0x00000000008d32ab in ServerLoop () at postmaster.c:1945
#26 0x00000000008d27fd in PostmasterMain (argc=6, argv=0x1e12020) at postmaster.c:1515
#27 0x00000000007cdc32 in main (argc=6, argv=0x1e12020) at main.c:245
It waiting for the result of below QEs:
gpadmin 66673 87843 0 14:37 ? 00:00:00 postgres: 6002, gpadmin testdb 192.168.2.12(47272) con6925 seg0 cmd16 SELECT waiting
gpadmin 66674 87844 0 14:37 ? 00:00:00 postgres: 6003, gpadmin testdb 192.168.2.12(51614) con6925 seg1 cmd16 SELECT waiting
gpadmin 66675 87845 0 14:37 ? 00:00:00 postgres: 6004, gpadmin testdb 192.168.2.12(52646) con6925 seg2 cmd16 SELECT waiting
gdb -p 66673
(gdb) bt
#0 0x00007f3d9b5c2b47 in semop () from /lib64/libc.so.6
#1 0x00000000008bec73 in PGSemaphoreLock (sema=0x7f3d8e6b7380, interruptOK=1 '\001') at pg_sema.c:422
#2 0x000000000094e901 in ProcSleep (locallock=0x20d9f10, lockMethodTable=0xdc0560 <default_lockmethod>) at proc.c:1338
#3 0x00000000009477cf in WaitOnLock (locallock=0x20d9f10, owner=0x20d4a88) at lock.c:1849
#4 0x000000000094654e in LockAcquireExtended (locktag=0x7ffff3878600, lockmode=1, sessionLock=0 '\000', dontWait=0 '\000', reportMemoryError=1 '\001',
locallockp=0x7ffff38785f8) at lock.c:1151
#5 0x00000000009438d5 in
LockRelationOid (relid=
97623, lockmode=1) at lmgr.c:102
#6 0x00000000005e1a85 in find_inheritance_children (parentrelId=97607, lockmode=1) at pg_inherits.c:121
#7 0x00000000005e1c4b in find_all_inheritors (parentrelId=97607, lockmode=1, numparents=0x0) at pg_inherits.c:211
#8 0x0000000000671faa in acquire_inherited_sample_rows (onerel=0x7f3d9d1d4a98, elevel=14, rows=0x217b838, targrows=10000, totalrows=0x7ffff3878860,
totaldeadrows=0x7ffff3878858) at analyze.c:2024
#9 0x00000000007259c8 in gp_acquire_sample_rows (fcinfo=0x7ffff3878a90) at analyzefuncs.c:202
#10 0x0000000000743002 in ExecMakeTableFunctionResult (funcexpr=0x2170660, econtext=0x216ff10, argContext=0x20d5230, expectedDesc=0x2170fd0,
randomAccess=0 '\000', operatorMemKB=32768) at execQual.c:2337
#11 0x000000000077531b in FunctionNext_guts (node=0x216faa0) at nodeFunctionscan.c:103
#12 0x0000000000775925 in FunctionNext (node=0x216faa0) at nodeFunctionscan.c:296
#13 0x000000000074cf1f in ExecScanFetch (node=0x216faa0, accessMtd=0x77590d <FunctionNext>, recheckMtd=0x77595a <FunctionRecheck>) at execScan.c:84
#14 0x000000000074cf8a in ExecScan (node=0x216faa0, accessMtd=0x77590d <FunctionNext>, recheckMtd=0x77595a <FunctionRecheck>) at execScan.c:132
#15 0x000000000077598f in ExecFunctionScan (node=0x216faa0) at nodeFunctionscan.c:326
#16 0x000000000073e12c in ExecProcNode (node=0x216faa0) at execProcnode.c:1043
#17 0x00000000007387d2 in ExecutePlan (estate=0x216f7b8, planstate=0x216faa0, operation=CMD_SELECT, sendTuples=1 '\001', numberTuples=0,
direction=ForwardScanDirection, dest=0x216b020) at execMain.c:2979
#18 0x0000000000735049 in standard_ExecutorRun (queryDesc=0x215aab8, direction=ForwardScanDirection, count=0) at execMain.c:968
#19 0x0000000000734cc1 in ExecutorRun (queryDesc=0x215aab8, direction=ForwardScanDirection, count=0) at execMain.c:834
#20 0x000000000096be92 in PortalRunSelect (portal=0x21486a8, forward=1 '\001', count=0, dest=0x216b020) at pquery.c:1149
#21 0x000000000096bb08 in PortalRun (portal=0x21486a8, count=9223372036854775807, isTopLevel=1 '\001', dest=0x216b020, altdest=0x216b020,
completionTag=0x7ffff38795a0 "") at pquery.c:990
#22 0x00000000009640d2 in exec_simple_query (
query_string=0x213dd94 "select * from pg_catalog.
gp_acquire_sample_rows(
97607, 10000, 't') as (totalrows pg_catalog.float8, totaldeadrows pg_catalog.float8, oversized_cols_bitmap pg_catalog.text, id integer, date date, amt n"...) at postgres.c:1798
#23 0x00000000009693b7 in PostgresMain (argc=1, argv=0x20ac968, dbname=0x20ac840 "testdb", username=0x20ac820 "gpadmin") at postgres.c:5357
#24 0x00000000008d7c40 in BackendRun (port=0x20dcaf0) at postmaster.c:4803
#25 0x00000000008d72e0 in BackendStartup (port=0x20dcaf0) at postmaster.c:4460
#26 0x00000000008d32ab in ServerLoop () at postmaster.c:1945
#27 0x00000000008d27fd in PostmasterMain (argc=5, argv=0x20aa890) at postmaster.c:1515
#28 0x00000000007cdc32 in main (argc=5, argv=0x20aa890) at main.c:245
gpadmin 66667 87875 0 14:37 ? 00:00:00 postgres: 6000, gpadmin testdb 192.168.2.11(62828) con6926 cmd10 COPY waiting
(gdb) bt
#0 0x00007f121a56ab47 in semop () from /lib64/libc.so.6
#1 0x00000000008bec73 in PGSemaphoreLock (sema=0x7f120ec28860, interruptOK=1 '\001') at pg_sema.c:422
#2 0x000000000094e901 in ProcSleep (locallock=0x1e41900, lockMethodTable=0xdc0560 <default_lockmethod>) at proc.c:1338
#3 0x00000000009477cf in WaitOnLock (locallock=0x1e41900, owner=0x1e8be90) at lock.c:1849
#4 0x000000000094654e in LockAcquireExtended (locktag=0x7ffd6d9b7260, lockmode=4, sessionLock=0 '\000', dontWait=0 '\000', reportMemoryError=1 '\001',
locallockp=0x7ffd6d9b7258) at lock.c:1151
#5 0x00000000009438d5 in LockRelationOid (relid=97607, lockmode=4) at lmgr.c:102
#6 0x00000000004df9ab in try_relation_open (relationId=97607, lockmode=4, noWait=0 '\000') at heapam.c:1126
#7 0x000000000066e603 in analyze_rel_internal (relid=97607, vacstmt=0x1e3f340, in_outer_xact=1 '\001', bstrategy=0x1f092f8) at analyze.c:269
#8 0x000000000066e48e in analyze_rel (relid=97607, vacstmt=0x1e3f340, in_outer_xact=1 '\001', bstrategy=0x1f092f8) at analyze.c:223
#9 0x000000000071a144 in vacuum (vacstmt=0x1e3f340, relid=0, do_toast=0 '\000', bstrategy=0x1f092f8, for_wraparound=0 '\000', isTopLevel=0 '\000')
at vacuum.c:387
#10 0x00000000008e07b9 in autostats_issue_analyze (relationOid=97623) at autostats.c:74
#11 0x00000000008e0d3b in auto_stats (cmdType=AUTOSTATS_CMDTYPE_COPY, relationOid=97623, ntuples=1, inFunction=0 '\000') at autostats.c:324
#12 0x0000000000683f37 in DoCopy (stmt=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_apr17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id e6252549-c329-4b88-a203-b80ac9ff80dd' ON SEGMENT CSV", processed=0x7ffd6d9b7890) at copy.c:1173
#13 0x000000000096dd01 in standard_ProcessUtility (parsetree=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_apr17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id e6252549-c329-4b88-a203-b80ac9ff80dd' ON SEGMENT CSV", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "") at utility.c:634
#14 0x000000000096d677 in ProcessUtility (parsetree=0x1e35540,
queryString=0x1e34678 "COPY \"public\".\"sales_1_prt_apr17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id e6252549-c329-4b88-a203-b80ac9ff80dd' ON SEGMENT CSV", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "") at utility.c:373
#15 0x000000000096c44a in PortalRunUtility (portal=0x1edfb08, utilityStmt=0x1e35540, isTopLevel=1 '\001', dest=0x1e35928, completionTag=0x7ffd6d9b7be0 "")
at pquery.c:1379
#16 0x000000000096c6db in PortalRunMulti (portal=0x1edfb08, isTopLevel=1 '\001', dest=0x1e35928, altdest=0x1e35928, completionTag=0x7ffd6d9b7be0 "")
at pquery.c:1510
#17 0x000000000096bbd2 in PortalRun (portal=0x1edfb08, count=9223372036854775807, isTopLevel=1 '\001', dest=0x1e35928, altdest=0x1e35928,
completionTag=0x7ffd6d9b7be0 "") at pquery.c:1016
#18 0x00000000009640d2 in exec_simple_query (
query_string=0x1e34678 "COPY \"public\".\"sales_1_prt_apr17\" FROM PROGRAM 'gpcopy_helper --listen --seg-id <SEGID> --cmd-id e6252549-c329-4b88-a203-b80ac9ff80dd' ON SEGMENT CSV") at postgres.c:1798
#19 0x0000000000968e8c in PostgresMain (argc=1, argv=0x1e14228, dbname=0x1e140c8 "testdb", username=0x1e13fd0 "gpadmin") at postgres.c:5215
#20 0x00000000008d7c40 in BackendRun (port=0x1e3c040) at postmaster.c:4803
#21 0x00000000008d72e0 in BackendStartup (port=0x1e3c040) at postmaster.c:4460
#22 0x00000000008d32ab in ServerLoop () at postmaster.c:1945
#23 0x00000000008d27fd in PostmasterMain (argc=6, argv=0x1e12020) at postmaster.c:1515
#24 0x00000000007cdc32 in main (argc=6, argv=0x1e12020) at main.c:245
5. gpcopy continue if I cancel one of the hang query: