CREATE OR REPLACE FUNCTION public.fn_create_db_files() RETURNS void AS
$$
DECLARE
v_function_name text := 'fn_create_db_files';
v_location int;
v_sql text;
v_db_oid text;
v_num_segments numeric;
v_skew_amount numeric;
v_version text;
BEGIN
v_location := 1000;
SELECT oid INTO v_db_oid
FROM pg_database
WHERE datname = current_database();
v_location := 2000;
v_sql := 'DROP VIEW IF EXISTS vw_file_skew';
v_location := 2100;
EXECUTE v_sql;
v_location := 2200;
v_sql := 'DROP EXTERNAL TABLE IF EXISTS db_files';
v_location := 2300;
EXECUTE v_sql;
v_location := 3000;
v_sql := 'CREATE EXTERNAL WEB TABLE db_files ' ||
'(segment_id int, relfilenode text, filename text, ' ||
'size numeric) ' ||
'execute E''ls -l $GP_SEG_DATADIR/base/' || v_db_oid ||
' | ' ||
'grep gpadmin | ' ||
E'awk {''''print ENVIRON["GP_SEGMENT_ID"] "\\t" $9 "\\t" ' ||
'ENVIRON["GP_SEG_DATADIR"] "/' || v_db_oid ||
E'/" $9 "\\t" $5''''}'' on all ' || 'format ''text''';
v_location := 3100;
EXECUTE v_sql;
v_location := 4000;
SELECT count(*) INTO v_num_segments
FROM gp_segment_configuration
WHERE preferred_role = 'p'
AND content >= 0;
v_location := 4100;
v_skew_amount := 1.2*(1/v_num_segments);
v_location := 4200;
SELECT CASE WHEN POSITION('Greenplum Database 5' in v) > 0 THEN 'GPDB5'
WHEN POSITION('Greenplum Database 4' in v) > 0 THEN 'GPDB4'
ELSE 'other' END INTO v_version
FROM version() AS v;
v_sql := 'CREATE OR REPLACE VIEW vw_file_skew AS ' ||
'SELECT schema_name, ' ||
'table_name, ' ||
'max(size)/sum(size) as largest_segment_percentage, ' ||
'sum(size) as total_size ' ||
'FROM ( ' ||
'SELECT n.nspname AS schema_name, ' ||
' c.relname AS table_name, ' ||
' sum(db.size) as size ' ||
' FROM db_files db ';
IF v_version = 'GPDB4' THEN
v_sql := v_sql || ' JOIN pg_class c ON ';
ELSE
v_sql := v_sql || ' JOIN gp_dist_random(''pg_class'') c ON ';
END IF;
v_sql := v_sql || ' (split_part(db.relfilenode, ''.'', 1))::text = c.relfilenode::text ' ||
' JOIN pg_namespace n ON c.relnamespace = n.oid ' ||
' WHERE c.relkind = ''r'' and c.relstorage not in (''x'', ''v'', ''f'')' ||
' GROUP BY n.nspname, c.relname, db.segment_id ' ||
') as sub ' ||
'GROUP BY schema_name, table_name ' ||
'HAVING sum(size) > 0 and max(size)/sum(size) > ' ||
v_skew_amount::text || ' ' ||
'ORDER BY largest_segment_percentage DESC, schema_name, ' ||
'table_name';
v_location := 4300;
EXECUTE v_sql;
EXCEPTION
WHEN OTHERS THEN
RAISE EXCEPTION '(%:%:%)', v_function_name, v_location, sqlerrm;
END;
$$
language plpgsql;
select fn_create_db_files();
SELECT *,pg_size_pretty(total_size) FROM vw_file_skew
where largest_segment_percentage > 0.20 and total_size > 200000000
ORDER BY total_size desc;
Hello Matias,
You can calculate size bases skew but row skew is just as good.
Did you know GPDB has a “hidden” columns called gp_segment_id that brings the segment id on the select?
So you can easily do:
Select gp_segment_id, count(*) num_row
From target_table
Group by 1
Order by 2 desc;
From the above you can calculate an skew index by comparing the segment with most rows to the average rows, or something like this.
Let me know if this helps.
Rgds,
Luis F R Macedo
Advisory Data Engineer & Business Development for Latam
VMware Tanzu Data
Call Me @ +55 11 98860 8596 (new)
Take care of the customers and the rest takes care of itself
--
You received this message because you are subscribed to the Google Groups "Greenplum Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to
gpdb-users+...@greenplum.org.
To view this discussion on the web visit
https://groups.google.com/a/greenplum.org/d/msgid/gpdb-users/f7be252f-ef6e-4948-a1ed-fc6b6eb2c0d1n%40greenplum.org.
Matias,
If you are working with the VMware Tanzu Greenplum the Greenplum Command Center (GPCC) has nice reports that likely gives all the information you are looking for.
If you are working with OSS you are on the right path. I would create a group of views that each focus on different aspects of the data you are looking for. Keep in mind that if your environment has many, many tables these queries will take time to run.
You can also leverage the pg_relation_size(<table/partition>) to get the size of the table. Keep in mind that partitioned tables you need to run this per partition and sum.
Pg_size_pretty() is also your friend.
I am usure what you mean by “table slowness”…
To view this discussion on the web visit https://groups.google.com/a/greenplum.org/d/msgid/gpdb-users/CAKKNfuL1_dOx6%3D1_As1PCN4Zye1CmZARyXctzCRjyGGqJq130w%40mail.gmail.com.
CREATE EXTERNAL WEB TABLE db_files_segments(segment_id INTEGER,relfilenode TEXT,filename TEXT,size NUMERIC,last_timestamp_change timestamp)EXECUTE E'find $GP_SEG_DATADIR/base/`PGOPTIONS="-c gp_session_role=utility" psql -p $GP_SEG_PORT template1 -At -c"select oid from pg_database where datname=\'${GP_DATABASE}\';"` -type f -printf "$GP_SEGMENT_ID|%f|%h/%f|%s|%TY-%Tm-%Td %TX\n" 2> /dev/null || true' ON ALLFORMAT 'text' (delimiter '|' null E'\\N' escape E'\\')ENCODING 'UTF8';
