basic sql: ddl

drop table customer;

create table customer(
    cid    bigint,
    fname  varchar(10),
    lname  varchar(10),
    email  varchar(100),
    city   varchar(100),
    state  varchar(2),
    dob    date
);

create table purchase(
    pid bigint,
    cid bigint,
    tim timestamp
);

dml: CRUD operations

insert into customer(cid,fname,lname) values(1,'John','Doe');
select fname,lname from customer where cid=1;
update customer set email='john.doe@gmail.com' where cid=1;
delete from customer where cid=1;

the important one is: select

CTAS: create table as... combines DDL and DML
-- create a table identical to customer, with an additional column
-- for customer initials.
create table custinitials as
select a.*, concat(substr(fname,1,1),substr(lname,1,1)) ini
from customer a;

CTEs (common table expressions)
-- very useful for multi-step queries.
-- find the most popular initials...
with initials as (
    select a.*, concat(substr(fname,1,1),substr(lname,1,1)) ini
    from customer a
), cnt as (
    select ini, count(*) c from initials group by ini
), maxcnt as (
    select max(c) maxc from cnt
)
select a.*
from cnt a
inner join maxcnt b on a.c = b.maxc;

functions: applied to each record.
substr(lname,1,1)  -- returns the first character.
instr(email,'@')   -- returns the location of '@' in the email.
                   -- locations start at 1... if it returns 0, '@' was not found.
sin/cos/log/exp ...

aggregates: applied to a group.

select count(*) from customer;
-- if no "group" is specified, the entire dataset is treated as one group.

select state, count(*) from customer group by state;
-- then the count is by state: for each value of state, we will have a count.

select cid, count(*) from customer group by cid;
-- what will this return?
-- assuming cid is the primary key... this will return a count of 1
-- for each customer.
-- what this means is that group by *can* explode.

select state, city, count(*) from customer group by state, city;
-- newer databases allow for easier syntax:
select state, city, count(*) from customer group by 1,2 order by 1,2;
-- that's group by the first column (in the select clause) followed by the 2nd column in the select clause.

-- having clause: having applies a condition *per group*,
-- similar to a where clause, but where applies *per record*.
select state, city, count(*)
from customer
where state in ('NY','NJ')
group by state, city
having count(*) > 10000;

-- windowing functions
-- assign a row number to each customer, in the order of their dob.
-- the oldest customer gets a 1.
select a.*, row_number() over (order by dob) rn from customer a;

-- find the oldest 10% of customers for each state...
with prcnt as (
    select a.*,
           1.0*row_number() over (partition by state order by dob) /
           count(*) over (partition by state) pr
    from customer a
)
select * from prcnt where pr <= 0.1;
-- suppose we have 100 customers... row_number ranges from 1 to 100,
-- and count will be 100. for each record we assign 1/100, 2/100, 3/100, etc.

min/max/first_value/last_value/avg/sum/stddev/row_number/count
-- most aggregate functions have an analytical (windowed) version.

--- big topic: JOINS

inner join: matching records have to exist in both tables.
-- trying to find customer fname, lname, and purchase timestamp.
select a.fname, a.lname, b.tim
from customer a
inner join purchase b on a.cid = b.cid;
-- if a customer never made a purchase, we won't return them.
-- if there's a purchase by a non-existent customer, we won't return that either.

left outer join (and its mirror image, the right outer join)
select a.fname, a.lname, b.tim
from customer a
left outer join purchase b on a.cid = b.cid;
-- if a customer never purchased anything, then tim will be NULL.
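because the unmatched customers come back with NULL purchase columns, count(b.cid) (which skips NULLs) gives a per-customer purchase count that includes zeroes. a small sketch against the same two tables (the num_purchases alias is just for illustration):

select a.cid, a.fname, a.lname, count(b.cid) num_purchases   -- count(b.cid) ignores NULLs, so never-purchased customers get 0
from customer a
left outer join purchase b on a.cid = b.cid
group by a.cid, a.fname, a.lname;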
the NULL rows also lead to an easy check for customers who have 0 purchases:
--- just add a where clause
select a.*
from customer a
left outer join purchase b on a.cid = b.cid
where b.cid is null;   -- <-- ensures no purchases.

full outer join
-- combination of the left and right joins... very useful for comparing datasets.
-- assume we acquired a company, and we want to merge customer datasets.
-- let's pretend we settle on fname,lname,dob as the "key" for our comparison.
select coalesce(a.fname,b.fname) fname,
       coalesce(a.lname,b.lname) lname,
       coalesce(a.dob, b.dob) dob,
       case when a.fname is null then 'NEW'
            when b.fname is null then 'OLD'
            else 'MERGE' end action,
       a.cid as old_cid,
       b.cid as new_cid
from customer a                     -- <-- our customers
full outer join mergercustomer b    -- <-- new customers for us
  on a.fname=b.fname and a.lname=b.lname and a.dob=b.dob;

cross join: matches everything to everything...

...in-between joins...

events(eid, tim, event_type)
-- suppose we have event types: START, END, THING.
-- we ONLY wish to find THING events that fall between a START and an END.
-- if a THING happens outside of a start/end range, we don't care.
create table start_end_tim as
select a.*, lead(tim) over (order by tim) next_tim
from events a
where event_type in ('START','END');

select *
from start_end_tim a
inner join events b
  on b.tim between a.tim and a.next_tim   -- <-- in-between join.
where a.event_type='START' and b.event_type='THING';
--- this will work...
--- how long will this run? what's the complexity?
--- to fix this: we apply HW6
-- pretend that "tim" is already minutes-since-midnight.
-- EXTRACT(EPOCH FROM TIMESTAMP) -- seconds since 1970
-- generate a record for every "minute" of the day.
create table minutes_in_day as
with digits as (
    select 0 n union all select 1 n union all select 2 n union all
    select 3 n union all select 4 n union all select 5 n union all
    select 6 n union all select 7 n union all select 8 n union all
    select 9 n
), numbs as (
    select a.n*1000+b.n*100+c.n*10+d.n as n
    from digits a
    cross join digits b
    cross join digits c
    cross join digits d
)
select * from numbs where n <= 1440;

create table ranges_with_buckets as
select a.*, b.n                      -- <-- b.n is the bucket id.
from start_end_tim a
cross join minutes_in_day b
where b.n between a.tim and a.next_tim;

select *
from ranges_with_buckets a
inner join events b
  on a.n = b.tim                           -- <-- b.tim is the minute since midnight.
 and b.tim between a.tim and a.next_tim    -- <-- in-between join.
where a.event_type='START' and b.event_type='THING';

---- joins rock!

---------------------------------------------------------
---------------------------------------------------------
HADOOP, MAP-REDUCE, HDFS (HADOOP FILE SYSTEM), SPARK

how is a single file stored across multiple machines?
cut it into pieces... somewhere you need to keep track of which pieces belong to
which file, and in which order. you also need to keep track of which machine has
which file blocks. to ensure redundancy: constantly check (and correct) that each
block is present on at least 3 machines.

map-reduce...
what's a map operation? a function that is applied to each record, without regard
for other records. it can run in any order, even in parallel across the entire dataset.
-- how does map link up with reduce? --- via the shuffle.
-- use hash(key) to determine which machine the record needs to be sent to
--   (see the sketch below).
-- once there, all records are sorted by key/value.
-- the reducer then accepts that sorted list... (all keys are bunched up with their
--   values---which is the input to the reducer).
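a toy illustration of that shuffle assignment, assuming Spark SQL's hash() and pmod() functions and an arbitrary choice of 4 reducers: every record with the same (state, city) key gets the same reducer_id, so all of its records land on the same machine.

-- which reducer would each customer record be shuffled to?
-- (hash()/pmod() as in Spark SQL; 4 reducers is an assumption for illustration.)
select state, city, pmod(hash(state, city), 4) as reducer_id
from customer;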
what's a reduce operation? a function that takes a key and a list of values for
that key, and performs some sort of function on them (often an aggregate). it can
also run in parallel and in any order relative to other reducers (which act on
different keys).

select state, city, count(*)
from customer
where state in ('NJ','NY')
group by state, city
having count(*) > 1000;
--- how will this translate into a map-reduce job?
-- the map function will filter records based on state in (NJ, NY).
-- the output of map is: key=(state,city), value=rest of record.
-- (behind the scenes, the shuffle will colocate records based on state,city.)
-- (behind the scenes, the sort will organize records by state, city, rest-of-record.)
-- the reducer will get a sequence of same-state-city records, until the next
--   state-city appears... the reducer just counts until (state,city) changes,
--   then it outputs state, city, count, and that's it.
-- in HIVE, this will be the actual "map-reduce" translation of the query.
-- in Spark, this will become a DAG (execution plan) for the query.
-- Spark will do something similar to map-reduce, except it will
--   cache stuff in RAM instead of writing things back to HDFS.

----

with recursive n(n) as (
    select 2 n
    union all
    select n+1 from n where n < 1000
)
select a.n
from n a
left join n b on b.n < a.n
group by a.n
having a.n <= 3 or min(a.n % b.n) > 0;

n: 2,3,4,5,6,7,8,9,10,11,12,13,...
4%2 = 0... min = 0
5%2 = 1 > 0, 5%3 > 0, 5%4 > 0
6%2 = 0, 7%2 > 0
8%2 = 0
output: 2, 3, 5, 7

returns(product, year, rate_of_return)
if we invested 1000 into product ABC in 2012, and sold it in 2022,
how much would we have?

select sum(rate_of_return)*1000
from returns
where product='ABC' and year between 2012 and 2022;
-- can't sum rates of return... we need a product.
-- if the rate is 0.07 <-- a 7% return, we want to multiply 1000 * (1 + 0.07)
-- if the rate is 0% for, say, 2019, then this turns into 1000 * (1 + 0.00)
-- if the rate is -2%, then it's actually 1000 * (1 - 0.02)

select exp(sum(log(1 + rate_of_return))) * 1000.0
from returns
where product='ABC' and year between 2012 and 2022;

with recursive prod(year, n) as (
    select year, (1 + rate_of_return) as n
    from returns
    where product = 'ABC' and year = 2012
    union all
    select b.year, a.n * (1 + b.rate_of_return)
    from prod a
    inner join returns b
      on b.product = 'ABC' and b.year = a.year + 1
    where b.year <= 2022
)
select a.*, 1000 * a.n   -- <-- this will return many records (one per year);
from prod a;             --     n will be a compounded rate of return.
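to sanity-check the exp/sum/log trick, here is a tiny made-up example (the sample_returns CTE and its rates are hypothetical, and it assumes log() is the natural logarithm, as in MySQL or Spark SQL; PostgreSQL would use ln() instead):

with sample_returns as (
    select 2012 as year, 0.07 as rate_of_return    -- +7%
    union all select 2013, 0.00                    --  0%
    union all select 2014, -0.02                   -- -2%
)
select exp(sum(log(1 + rate_of_return))) * 1000.0 as final_value
from sample_returns;
-- 1000 * 1.07 * 1.00 * 0.98 = 1048.6, which is what the recursive version
-- would compound to over the same three years.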