WITH RECURSIVE option in GPDB

EManchester

unread,

Mar 3, 2016, 11:06:20 AM3/3/16

to Greenplum Users

Hi all,

So this works in PostgreSQL:

create table employee (
empid integer,
firstname text,
lastname text,
managerid integer
);

insert into employee values (101,'John','Doe',NULL);
insert into employee values (102,'Jo','Doe',101);
insert into employee values (103,'Jonny','Doe',101);
insert into employee values (104,'Jennie','Doe',101);
insert into employee values (201,'Bob','Smith',NULL);
insert into employee values (202,'Betty','Smith',201);
insert into employee values (203,'Brent','Smith',201);
insert into employee values (204,'Brad','Smith',201);

cat query.sql
with recursive EmpList
as ( select EmpID, FirstName, LastName, ManagerID
     from Employee
     where ManagerID is NULL
     union all
       select Employee.EmpID, Employee.FirstName, Employee.LastName, Employee.ManagerID
       from Employee
       inner join EmpList ON Employee.ManagerID = EmpList.EmpID
       where Employee.ManagerID is NOT NULL )
select *
from EmpList;

psql -V
psql (PostgreSQL) 9.4.3

psql -U postgres -d test -f query.sql
empid | firstname | lastname | managerid
-------+-----------+----------+-----------
   101 | John      | Doe      |
   201 | Bob       | Smith    |
   102 | Jo        | Doe      |       101
   103 | Jonny     | Doe      |       101
   104 | Jennie    | Doe      |       101
   202 | Betty     | Smith    |       201
   203 | Brent     | Smith    |       201
   204 | Brad      | Smith    |       201
(8 rows)

But when I create the same table, distributed by (empid) and try the same query in GPDB, it doesn't work:

[gpadmin@gpdb ~]$ gpstate | grep local
20160303:08:51:51:002662 gpstate:gpdb:gpadmin-[INFO]:-local Greenplum Version: 'postgres (Greenplum Database) 4.3.7.0 build 2'

[gpadmin@gpdb ~]$ psql -d test -f query.sql
psql:query.sql:11: ERROR: RECURSIVE option in WITH clause is not supported

Is there an easy way to achieve the same ordering (NULLs first) in Greenplum, without the recursive option?

cat query2.sql
select EmpID, FirstName, LastName, ManagerID
from Employee
where ManagerID is NULL
union all
select Employee.EmpID, Employee.FirstName, Employee.LastName, Employee.ManagerID
from Employee
where Employee.ManagerID is NOT NULL
order by ManagerID asc;

psql -f query2.sql
empid | firstname | lastname | managerid
-------+-----------+----------+-----------
   104 | Jennie    | Doe      |       101
   103 | Jonny     | Doe      |       101
   102 | Jo        | Doe      |       101
   203 | Brent     | Smith    |       201
   202 | Betty     | Smith    |       201
   204 | Brad      | Smith    |       201
   201 | Bob       | Smith    |
   101 | John      | Doe      |
(8 rows)

Heikki Linnakangas

unread,

Mar 3, 2016, 11:17:06 AM3/3/16

to EManchester, Greenplum Users

On 03/03/16 18:06, EManchester wrote:
> Is there an easy way to achieve the same ordering (NULLs first) in
> Greenplum, without the recursive option?
>
> cat query2.sql
> select EmpID, FirstName, LastName, ManagerID
> from Employee
> where ManagerID is NULL
> union all
> select Employee.EmpID, Employee.FirstName, Employee.LastName,
> Employee.ManagerID
> from Employee
> where Employee.ManagerID is NOT NULL
> order by ManagerID asc;

Hmm. This isn't strictly-speaking the same as the recursive query. But
yes, you can easily achieve NULLS FIRST ordering for that. The git
master version of GPDB supports the SQL standard NULLS FIRST clause, so
you can just add NULLS FIRST in the ORDER BY. In earlier versions you
achieve the same with:

select * from (

select EmpID, FirstName, LastName, ManagerID
from Employee
where ManagerID is NULL
union all
select Employee.EmpID, Employee.FirstName, Employee.LastName,
Employee.ManagerID
from Employee

where Employee.ManagerID is NOT NULL

) t order by managerid is not null, managerid asc;

- Heikki

Rupendra Bandyopadhyay

unread,

Mar 3, 2016, 11:22:15 AM3/3/16

to EManchester, Greenplum Users

Maybe write the query like below and ignore the extra column in the output?

```

select EmpID, FirstName, LastName, ManagerID, coalesce(Managerid,-1)

from Employee

where ManagerID is NULL

union all

select Employee.EmpID, Employee.FirstName, Employee.LastName, Employee.ManagerID, coalesce(Employee.ManagerID, 1)

from Employee

where Employee.ManagerID is NOT NULL

order by 5 asc;

empid | firstname | lastname | managerid | coalesce

-------+-----------+----------+-----------+----------

201 | Bob | Smith | | -1

101 | John | Doe | | -1

103 | Jonny | Doe | 101 | 101

102 | Jo | Doe | 101 | 101

104 | Jennie | Doe | 101 | 101

202 | Betty | Smith | 201 | 201

204 | Brad | Smith | 201 | 201

203 | Brent | Smith | 201 | 201

(8 rows)

```

--
You received this message because you are subscribed to the Google Groups "Greenplum Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to gpdb-users+...@greenplum.org.
To post to this group, send email to gpdb-...@greenplum.org.
Visit this group at https://groups.google.com/a/greenplum.org/group/gpdb-users/.
For more options, visit https://groups.google.com/a/greenplum.org/d/optout.

EManchester

unread,

Mar 3, 2016, 11:25:57 AM3/3/16

to Greenplum Users, emanche...@gmail.com

Thanks for the quick answer. It solves this specific problem.

[gpadmin@gpdb ~]$ psql -f query2.sql

empid | firstname | lastname | managerid
-------+-----------+----------+-----------

101 | John | Doe |
201 | Bob | Smith |

   104 | Jennie    | Doe      |       101
   103 | Jonny     | Doe      |       101
   102 | Jo        | Doe      |       101
   203 | Brent     | Smith    |       201
   202 | Betty     | Smith    |       201
   204 | Brad      | Smith    |       201

(8 rows)

Will Greenplum support WITH RECURSIVE at some point in the future? Is it on the roadmap?

Jon Roberts

unread,

Mar 3, 2016, 11:29:25 AM3/3/16

to EManchester, Greenplum Users

UNION will make the database read the employee table twice which is slower and less efficient that simply changing the order by.

select EmpID, FirstName, LastName, ManagerID

from Employee

order by coalesce(managerid, -1);

I'm not sure on the with recursive timeline but Greenplum is codebase is getting upgraded to 8.3, then 8.4, and so on. I can't imagine it ever performing very well because of the nature of the feature seems to demand a nested loop which is known to be slow for analytics.

Jon Roberts

On Thu, Mar 3, 2016 at 10:25 AM, EManchester <emanche...@gmail.com> wrote:

EManchester

unread,

Mar 3, 2016, 11:41:50 AM3/3/16

to Greenplum Users, emanche...@gmail.com

Slick solution. Thanks for providing it.

[gpadmin@gpdb ~]$ cat query3.sql

select EmpID, FirstName, LastName, ManagerID
from Employee
order by coalesce(managerid, -1);

[gpadmin@gpdb ~]$ psql -f query3.sql

empid | firstname | lastname | managerid
-------+-----------+----------+-----------
   101 | John      | Doe      |
   201 | Bob       | Smith    |
   104 | Jennie    | Doe      |       101
   103 | Jonny     | Doe      |       101
   102 | Jo        | Doe      |       101
   203 | Brent     | Smith    |       201
   202 | Betty     | Smith    |       201
   204 | Brad      | Smith    |       201
(8 rows)

Having WITH RECURSIVE would help support SQL migrations from other platforms, at least. Agreed that there would be a performance lag due to the repetitive reads with the union and the use of a nested loop.

Thanks.

Srivatsan Ramanujam

unread,

Mar 11, 2016, 8:41:05 PM3/11/16

to EManchester, Greenplum Users

Not sure if this is of interest to you, but while we wait on the RECURSIVE functionality in Greenplum, the following is a simple recursive query implemented through a PL/Python driver function:

--1) Write UDF to recursively explode a table
create or replace function explode_table(input_table text)
returns text
as
$$
    import plpy
    sql = """
        drop table if exists exploded_table;
        create table exploded_table
        (
             part_id int,
             sub_part_id int,
             subpart_attribute text
        ) distributed randomly;
    """
    plpy.execute(sql)
    explosion_complete = False
    while(not explosion_complete):
        sql  = """
            drop temp table if exists exploded_temp;
            create temp table exploded_temp
            as
            (
                select
                    t1.sub_part_id as part_id,
                    t2.part_id as sub_part_id,
                    t2.subpart_attribute
                from
                    {input_table} t1,
                    {input_table} t2
                where
                     t1.sub_part_id = t2.part_id
            ) distributed randomly;
        """.format(
            input_table=input_table
        )
        plpy.execute(sql)

        num_rows_sql= """
            select
                count(*) as num_rows
            from
                exploded_temp
        """
        num_rows = plpy.execute(num_rows_sql)[0]['num_rows']
        if num_rows == 0:
            explosion_complete = True
        else:
            sql = """
                insert into exploded_table
                select
                    *
                from
                    exploded_temp;
            """
            plpy.execute(sql)
    return 'Successfully exploded {0}'.format(input_table)
$$language plpythonu;

--2) Invoke the UDF to explode the table
select
    explode_table('parts_subparts_relation_table');

--

You received this message because you are subscribed to the Google Groups "Greenplum Users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to gpdb-users+...@greenplum.org.
To post to this group, send email to gpdb-...@greenplum.org.
Visit this group at https://groups.google.com/a/greenplum.org/group/gpdb-users/.
For more options, visit https://groups.google.com/a/greenplum.org/d/optout.

--

____________________________________

Srivatsan Ramanujam | Data Science

Pivotal HQ - Palo Alto, CA

Mobile: 650-483-5630

____________________________________

Reply all

Reply to author

Forward