\timing
set enable_sort = false;
set enable_hashjoin = false;
explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

\o /dev/null
select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;
\o


set enable_hashjoin=true;
set enable_sort = true;

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

\o /dev/null
select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;
\o

set enable_sort = false;

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

\o /dev/null
select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;
\o

set enable_sort = true;

explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

set enable_sort = false;

explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;


explain analyze select ename,kids.kname from emp,kids where kids.eno = emp.eno and emp.sal > 50000

explain analyze select ename,kids.kname from emp,kids where kids.eno = emp.eno and emp.sal > 50000 limit 1


explain select * from  emp  join kids using (eno);
explain select * from dept join emp using(dno) join kids using (eno);


#table definitions

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

select ename,count(*) from emp,kids,dept where kids.eno = emp.eno and emp.dno = dept.dno and emp.sal > 50000 and dept.bldg = 1 group by ename having count(*) > 7

select * from (select ename,count(*) from emp,kids,dept where kids.eno = emp.eno and emp.dno = dept.dno and emp.sal > 50000 and dept.bldg = 1 group by ename having count(*) > 7) t join emp on emp.ename = t.ename;


#emp 1M tuples
#dept 100000 tuples
#kids 3M tuplespsq


#create tables, assuming data is in testdb
pg_ctl -D testdb start
psql test

drop table kids;
drop table emp;
drop table dept;

create table dept (dno int primary key, bldg int);

insert into dept (dno, bldg) select x.id, (random() * 10)::int FROM generate_series(0,10000) AS x(id);

create table emp (eno int primary key, dno int references dept(dno), sal int, ename varchar);

insert into emp (eno, dno, sal, ename) select x.id, (random() * 10000)::int, (random() * 55000)::int, 'emp' || x.id from generate_series(0,1000000) AS x(id);


create table kids (kno int primary key, eno int references emp(eno), kname varchar);

insert into kids (kno,eno,kname) select x.id, (random() * 1000000)::int, 'kid' || x.id from generate_series(0,3000000) AS x(id);

\timing


#look at query plan

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

#and run time

explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;


# try without hash join
set enable_hashjoin=false;


explain analyze select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;


# note long time spent doing sorts to disk

> emacs testdb/postgresql.conf

#change workmem from 1MB to 200MB


pg_ctl -D pgdata reload


# rerun above queries, with and without hash join -- note that both got a lot faster!


#optionally, try without hash join and sort join (i.e INL)

set enable_hashjoin=false;
set enable sort=false;


#try with an index

create index i on emp(sal);

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

#try with a clustered index

cluster i on emp;

explain select ename,count(*) from emp,kids where kids.eno = emp.eno and emp.sal > 50000 group by ename having count(*) > 7;

#didn't change, why?  need to create stats!

vacuum analyze;


#rerun, note it uses index scan


#now look at some stats

alter table emp alter column sal set statistics 10;

analyze verbose emp


select relpages,reltuples from pg_class where relname='emp';	

 relpages |  reltuples  
----------+-------------
    70064 | 1.00182e+06

select count(*) from emp;

  count  
---------
 1000001

What's that about?

1. estimate selectivity of query like "where emp.sal < 30000'

SELECT histogram_bounds FROM pg_stats 
WHERE tablename='emp' AND attname='sal';

lecture=# SELECT histogram_bounds FROM pg_stats 
WHERE tablename='emp' AND attname='sal';

                        histogram_bounds                        
----------------------------------------------------------------
 {1,5650,11223,16752,22356,27672,33073,38544,44048,49560,54998}


5 buckets in their entirety, plus
(30000-27672)/(33073-27672) = .431031 of 6th bucket

5.431031 * .1 * 1.00182e+06 = 544092 tuples

Doesn't quite work out:

lec4=# explain select count(*) from emp where sal < 30000;
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate  (cost=83947.98..83947.99 rows=1 width=0)
   ->  Seq Scan on emp  (cost=0.00..82586.70 rows=544513 width=0)
         Filter: (sal < 30000)
(3 rows)


select count(*) from emp where sal < 30000;
 count  
--------
 545473


So what's going on?

Most common values 

SELECT null_frac, n_distinct, most_common_vals, most_common_freqs FROM pg_stats
 where tablename = 'emp' and attname='sal';


         0 |      53095 | {2821,5136,38926,39340,1760,2720,2917,7010,7490,7654} | {0.0002,0.0002,0.0002,0.0002,0.000166667,0.000166667,0.000166667,0.000166667,0.000166667,0.000166667}
(1 row)

Most common values represent

4*(.0002) + 6*(.000166667)  = .0018

Histogram is remaining 
1 - .0018 = .9982


Looking at most frequent values, sal < 30k encompasses 8 of them:

 .0002 * 2 + 6 * .000166667= .0014

(.5431031 * .9982 + .0014) * 1.00182e+06 = 544514


Most frequent values also used for equality predicates

explain select count(*) from emp where sal = 2821;

                          QUERY PLAN                           
---------------------------------------------------------------
 Aggregate  (cost=82587.20..82587.21 rows=1 width=0)
   ->  Seq Scan on emp  (cost=0.00..82586.70 rows=200 width=0)
         Filter: (sal = 2821)
(3 rows)


otherwise

lecture=# explain select count(*) from emp where sal = 2822;
;
                          QUERY PLAN                          
--------------------------------------------------------------
 Aggregate  (cost=82586.75..82586.76 rows=1 width=0)
   ->  Seq Scan on emp  (cost=0.00..82586.70 rows=19 width=0)
         Filter: (sal = 2822)
(3 rows)

lecture=# select count(*) from emp where sal = 2821;
;
 count 
-------
    22
(1 row)

lecture=# select count(*) from emp where sal = 2822;
;
 count 
-------
    20
(1 row)


Whats going on here?  It computes the counts with a sample;  sample was skewed.