drop table employee;   --statement 1
create table employee (   --statement 2
   empid bigint,
   fname varchar(40),
   lname varchar(40),
   email varchar(40),
   dob date,
   depid int,   --- department id
   salary decimal(18,8),
   managerid bigint --- manager id, who is also an employee.
);

create table department (
   depid int,
   name varchar(30)
);

insert into employee (empid,fname,lname) values (1, 'John','Doe'); --s 3

select * from employee; --s4 

update employee set dob='1999-01-01' where empid=1;  --s5

delete employee where empid=1;  --s6


---of these, 99% of the complexity comes from "select" statement.


--calculate stuff...

select age(dob), concat( substr(fname,1,1),substr(lname,1,1)) as initials 
from employee where empid=12;


-- filter stuff

select * from employee where dob >=cast('2000-01-01' as date); 


--- count stuff

select count(*) from employee;


select export(month from dob),count(*)   -- get count of birthdays per month
from employee
group by 1    ---some databases require this to be: export(month from dob)
order by 2 desc    --give me highest count first
;


---count ratios... as in... millenials to everyone

select 100.0*sum(case when dob>='2000-01-01' then 1.0 else 0.0 end)/sum(1.0) prcnt
from employee;

---count stddev/avg;; to find outliers.
---find employees with unusually high salaries (>4sd )

with stats as (   --using CTEs! 
  select avg(salary) a, stddev(salary) sd
  from employee 
)
select *
from employee cross join stats
where salary > a + 4*st

--- can also do this with windowing functions:

with stats as (
  select a.*, avg(salary) over()  av, stddev(salary) over() sd
  from employee a
)
select *
from stats
where salary > av + 4*st


---find employees with unusually high salaries (>4sd )
---within their department

with stats as (   --using CTEs! 
  select depid, avg(salary) av, stddev(salary) sd
  from employee 
  group by depid  --department id
)
select *
from employee a
   inner join stats b
   on a.depid=b.depdi
where salary > av + 4*st

--- can also do this with windowing functions:

with stats as (
  select a.*, avg(salary) over(partition by depid)  av, 
     stddev(salary) over(partition by depid) sd
  from employee a
)
select *
from stats
where salary > av + 4*st

----next topic: JOIN

--- grab employee name, and department name.
select a.fname,a.lname, b.name
from employee a
  inner join department b
  on a.depid=b.depid


--you could also do (but avoid in production)

select fname,lname,name
from employee
  natural inner join department 

--in fact, in production, always specify full list of columns you're using
--- never do: select * ...

--- select * might break if someone adds columns, removes columns, etc.
--- also, it is very diffuclt to profile database code to determine what 
--- columns are used

--- change requests: determine impact.
----- change datatype from int to bigint, or from bigint into varchar, etc.

---one way is to create a script that parses SQL, creates a mapping of 
---what columns are used by which project, if they're used in where clause,
--- in joins, calculations, etc., 
--- or pass-through (just in select), etc.
--- whenever a change request to "column" arrives, you can instantly know
--- which projects might need attention.


there are several join types....

inner join        <-- bulk of joins
left outer join   <-- very useful
full outer join    <-- very few practical uses
cross join    <-- don't use often

right outer join   <-- don't use, not supported  by some datases; change to left join (by swapping left/right tables).

natural inner join <-- dangerous to use in production environment
natural left outer join 
natural right outer join

--don't use "where" for joins... only use "where" for filtering.
e.g. don't do this:
select *
from employee a, department b
where a.depid=b.depid

---ambiguos way of doing left-join using this notation; "left outer join" syntax is cleaner


---eliminate dups:

select distinct fname,lname
from employee;

select fname,lname
from employee
group by fname,lname;


---there are subqueries... 

select *
from employee
where dob >= ALL (select max(dob) from employee)

---there are better ways of doing these, windowing and CTEs, etc.
---avoid sub-queries.


---set operations:

select fname,lname
from employee
union all
select fname,lname
from customer

---union, removes dups, union-all keeps dups.
---union all is faster

select fname,lname
from employee
minus     --- some database use "except"  
select fname,lname
from customer

-- some databases also have "intersect" 

---except for union all, most of these will implicitly sort the data (may be slow).

--set membership


select *
from employee
where depid in (1,2,3,5,8)

select * from employee where depid not in(6,7)

-------------

lets take a 10 min break, when we get back: windowing functions.

--- HW5: for days with splits AND dividends: apply splits first.
--- dividends are per share, so first determine number of shares (e.g. splits first).

--hw5hint:
drop table daily_prcnt;
create table daily_prcnt as
with lags as (
  select a.tdate,a.symbol,a.close,
   lag(a.close) over (partition by a.symbol order by a.tdate) pclose,
   coalesce(CAST(b.pre AS FLOAT), 1) as pre, coalesce(CAST(b.post AS FLOAT), 1) as post, coalesce(CAST(c.amount AS FLOAT), 0) as dividend
  from cts a
    left outer join splits b
    on a.tdate=b.tdate and a.symbol=b.symbol
    left outer join dividends c
    on a.tdate=c.tdate and a.symbol=c.symbol
 ),
 HW5 as (select a.*,
   100.0 *((close * post) / (pclose * pre - dividend) - 1) as prcnt
 from lags a
 where pclose>0 and (pclose * pre - dividend)>0
)
select a.tdate, a.symbol, prcnt
from hw5 a;

--hw6 hints: too much data, so... create a monthly_prcnt table.
create table monthly_prcnt as 
  select to_char(tdate,'YYYYMM') month, 
   exp( sum( log( ... )) ) prcnt
  from daily_prcnt 
  group by 1;


-------------------------------------------------------

windowing functions...

sum/avg/stddev/min/max

row_number(), 
rank()
dense_rank()
lead
lag 
last_value
first_value


--for every employee, find the next youngest employee in employee table...
---try do to this without windowing functions.

select ...
 lag(empid) over (order by dob) ...


---recursive queries...


find everyone who reports to bob.

with recursive emp(empid) as (
   select a.empid, 1 lvl     -- find everyone who reports directly to bob.
    from employee a 
      inner join employee b
      on a.managerid=b.employeeid
      where b.fname='bob'
   union all
   select a.empid, b.lvl+1 as lvl  --find everyone who reports to above (and recursively).
   from employee a
      inner join emp b
      on a.managerid = b.empid
)
select *
from emp;

-----
hw9 hint:

export ts=`date +%Y%m%d%H%M%S`
export db=fall2021

for t in dividends splits; do

  echo "select * from $t"|sqlrunner - |gzip -c | gpg -r jj@cocomelon.com -e -a > $t.$ts.csv.gz.enc

done