\timing set enable_sort = false; set enable_hashjoin = false; explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o /dev/null select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o set enable_hashjoin=true; set enable_sort = true; explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o /dev/null select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o set enable_sort = false; explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o /dev/null select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; \o set enable_sort = true; explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; set enable_sort = false; explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; explain analyze select ename,kids.kname from emp,kids where kids.eno = emp.eno and emp.sal > 50000 explain analyze select ename,kids.kname from emp,kids where kids.eno = emp.eno and emp.sal > 50000 limit 1 explain select * from emp join kids using (eno); explain select * from dept join emp using(dno) join kids using (eno); #table definitions explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; select ename,count(*) from emp,kids,dept where kids.eno = emp.eno and emp.dno = dept.dno and emp.sal > 50000 and dept.bldg = 1 group by ename having count(*) > 7 select * from (select ename,count(*) from emp,kids,dept where kids.eno = emp.eno and emp.dno = dept.dno and emp.sal > 50000 and dept.bldg = 1 group by ename having count(*) > 7) t join emp on emp.ename = t.ename; #emp 1M tuples #dept 100000 tuples #kids 3M tuplespsq #create tables, assuming data is in testdb pg_ctl -D testdb start psql test drop table kids; drop table emp; drop table dept; create table dept (dno int primary key, bldg int); insert into dept (dno, bldg) select x.id, (random() * 10)::int FROM generate_series(0,10000) AS x(id); create table emp (eno int primary key, dno int references dept(dno), sal int, ename varchar); insert into emp (eno, dno, sal, ename) select x.id, (random() * 10000)::int, (random() * 55000)::int, 'emp' || x.id from generate_series(0,1000000) AS x(id); create table kids (kno int primary key, eno int references emp(eno), kname varchar); insert into kids (kno,eno,kname) select x.id, (random() * 1000000)::int, 'kid' || x.id from generate_series(0,3000000) AS x(id); \timing #look at query plan explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; #and run time explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; # try without hash join set enable_hashjoin=false; explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; # note long time spent doing sorts to disk > emacs testdb/postgresql.conf #change workmem from 1MB to 200MB pg_ctl -D pgdata reload # rerun above queries, with and without hash join -- note that both got a lot faster! #optionally, try without hash join and sort join (i.e INL) set enable_hashjoin=false; set enable sort=false; #try with an index create index i on emp(sal); explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; #try with a clustered index cluster i on emp; explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7; #didn't change, why? need to create stats! vacuum analyze; #rerun, note it uses index scan #now look at some stats alter table emp alter column sal set statistics 10; analyze verbose emp select relpages,reltuples from pg_class where relname='emp'; relpages | reltuples ----------+------------- 70064 | 1.00182e+06 select count(*) from emp; count --------- 1000001 What's that about? 1. estimate selectivity of query like "where emp.sal < 30000' SELECT histogram_bounds FROM pg_stats WHERE tablename='emp' AND attname='sal'; lecture=# SELECT histogram_bounds FROM pg_stats WHERE tablename='emp' AND attname='sal'; histogram_bounds ---------------------------------------------------------------- {1,5650,11223,16752,22356,27672,33073,38544,44048,49560,54998} 5 buckets in their entirety, plus (30000-27672)/(33073-27672) = .431031 of 6th bucket 5.431031 * .1 * 1.00182e+06 = 544092 tuples Doesn't quite work out: lec4=# explain select count(*) from emp where sal < 30000; QUERY PLAN ------------------------------------------------------------------ Aggregate (cost=83947.98..83947.99 rows=1 width=0) -> Seq Scan on emp (cost=0.00..82586.70 rows=544513 width=0) Filter: (sal < 30000) (3 rows) select count(*) from emp where sal < 30000; count -------- 545473 So what's going on? Most common values SELECT null_frac, n_distinct, most_common_vals, most_common_freqs FROM pg_stats where tablename = 'emp' and attname='sal'; 0 | 53095 | {2821,5136,38926,39340,1760,2720,2917,7010,7490,7654} | {0.0002,0.0002,0.0002,0.0002,0.000166667,0.000166667,0.000166667,0.000166667,0.000166667,0.000166667} (1 row) Most common values represent 4*(.0002) + 6*(.000166667) = .0018 Histogram is remaining 1 - .0018 = .9982 Looking at most frequent values, sal < 30k encompasses 8 of them: .0002 * 2 + 6 * .000166667= .0014 (.5431031 * .9982 + .0014) * 1.00182e+06 = 544514 Most frequent values also used for equality predicates explain select count(*) from emp where sal = 2821; QUERY PLAN --------------------------------------------------------------- Aggregate (cost=82587.20..82587.21 rows=1 width=0) -> Seq Scan on emp (cost=0.00..82586.70 rows=200 width=0) Filter: (sal = 2821) (3 rows) otherwise lecture=# explain select count(*) from emp where sal = 2822; ; QUERY PLAN -------------------------------------------------------------- Aggregate (cost=82586.75..82586.76 rows=1 width=0) -> Seq Scan on emp (cost=0.00..82586.70 rows=19 width=0) Filter: (sal = 2822) (3 rows) lecture=# select count(*) from emp where sal = 2821; ; count ------- 22 (1 row) lecture=# select count(*) from emp where sal = 2822; ; count ------- 20 (1 row) Whats going on here? It computes the counts with a sample; sample was skewed.