-- list all tables select * from dbc.tables; -- get ddl show table table_name; show view view_name; -- show current user select user; --get suggestion of collect statistics target diagnostic helpstats on for session; -- ctas create table schema.tablename_new as schema.tablename with data; -- timestamp string -> timestamp -> date select cast(cast('2016-03-07 23:59:59' as timestamp) as date) -- decimal -> date -> string select cast(cast(20170101 as char(8)) as date format 'YYYYMMDD') (format 'yyyy-mm-dd') (char(10)) --format sample select cast(cast(9999.9999999 as dec(18,10) format '--ZZ9.9999999999' ) as varchar(50)) select cast(cast(123456789.987654321 as dec(19,10) format '--Z(7)9.9(7)' ) as varchar(50)) select cast(cast(-123456789.987654321 as dec(19,10) format '--Z(7)9.9(7)' ) as varchar(50)) --oreplace select oreplace('hello warld', 'a', 'o'); --qualify select * from dbc.tables qualify row_number() over (order by databasename) between 1 and 10 --csum(cumulative sum) select version, csum(version, version) from dbc.tables --CSUM with reset logic (GROUP BY) select version, csum(version, version) from dbc.tables group by version --mavg(moving average) select version ,mavg(version, 3, version) from dbc.tables --mavg with reset select version ,mavg(version, 3, version) from dbc.tables group by 1 --msum(moving sum) select distinct version ,msum(version, 3, version) from dbc.tables --mdiff(moving difference) select version ,mdiff(version, 3, version) from dbc.tables --rank select version ,rank(version) from dbc.tables --dense rank select version ,dense_rank() over (order by version) from dbc.tables --quantile select distinct version ,quantile(10, version desc) from dbc.tables --lag and lead select calendar_date ,max(calendar_date) over(partition by 1 order by calendar_date rows between 1 preceding and 1 preceding) ,min(calendar_date) over(partition by 1 order by calendar_date rows between 1 following and 1 following) from SYS_CALENDAR.CALENDAR order by 1 -- merge merge into department using values (700, 'shipping', 990000.00) as dept(deptnum, deptname,budgamt) on dept.deptnum = department_number when matched then update set budget_amount = dept.budgamt when not matched then insert values (dept.deptnum, dept.deptname, dept.budgamt, null); -- with recursive with recursive all_trips (origin, destination, cost, depth) as ( select origin, destination, cost, 0 from flights where origin = 'lax' union all select all_trips.origin, flights.destination, all_trips.cost + flights.cost, all_trips.depth + 1 from all_trips inner join flights on all_trips.destination = flights.origin and all_trips.depth < 3 ) select * from all_trips;
-- drop table drop table if exists schema.table; -- create fact table create table schema.table ( id integer, ) distribute by hash(id) compress low; -- create dimension table create table schema.table ( id character varying, ) distribute by replication compress low; -- collect statistics analyze schema.table; -- * multiple columns to where-in isn't allowed select * from schema.t1 where (c1, c2) in (select d1, d2 from schema.t2); -- > error -- cast select cast(c1 as integer) from schema.table; select c1::integer from schema.table; -- collaborative filtering select * from cfilter ( on (select 3) partition by 3 inputtable ('public.input_table') outputtable ('public.output_table') inputcolumns ('c1') joincolumns ('j1') droptable ('true') ); -- canopy select * from canopy( on (select 1) partition by 1 inputtable('public.input_table') loosedistance('130.0') tightdistance('65.0')) order by canopyid ; -- kmeans modeling select * from kmeans (on (select 1) partition by 1 inputtable('public.input_table') outputtable('public.output_table') numberK('3')) ; -- kmeans scoring select * from kmeansplot ( on public.input_table partition by any on public.kmeans_modeling_output dimension centroidstable ('public.kmeans_modeling_output') ) order by 2,1
L05 - Multi-Version Concurrency Control
Append-Only Storage: 全ての更新は新しいタプルをテーブルの空き部分に作成し、参照先をその部分に変更する
Time-Travel Storage: Time-Travelテーブルに複製し、メインテーブルのバージョンを上書きする
Delta Storage: 変更された値のみをDeletaストレージセグメントに複製し、マスターバージョンを上書きする
PKey Indexesはバージョンチェインのヘッドを常に指す。Append-Only
Secondary Indexes: Logical Pointers(Primary Indexに対してPrimary Keyを持つ)とPhysical Pointersの2つに分けられる
Microsoft HEKATON: OLTP Engine for MSSQL started in 2008
L06 - Index Locking & Latching
Index Locks vs. Latches
INDEX: improves the speed of date retrieval. The additional costs are writes and storage spaces.
Order Preserving Indexes: Maintain keys in some sorted order. O(log n)
Hashing Indexes: Maps a hash of the key to particular record. Only supports equality predicates with O(1)
B-tree: Values in all nodes in the tree
B+tree: Only stores values in leaf nodes
Locks vs. Latches
Separate User transactions vs. Threads
Protect Database Contents vs. In-Memory Data Structures
During Entire Transactions vs. Critical Sections
Deadlock Detection & Resolution vs. Avoidance
Latch Implementions
Blocking OS Mutex std::mutex
Test-and-Set Spinlock(TAS) std::atomic<T>
Queue-based Spinlock (MCS) std::atomic<Latch*>
Reader-Writer Locks
Index Locking Schemes
Predicate Locks: Shared lock on SELECT-WHERE, Exclusive lock on UPDATE, INSERT, DELEET-WHERE. Hard to implement
Key-Value Locks: Locks that cover a single key value
Gap Locks
Key-Range Locks
Hierarchical Locking: Allow for a txn to hold wider key-range locks with differecnt locking modes
L07 Latch-free OLTP Indexes
Bw-Tree T-Trees: Based on AVL Trees. Stores pointers to their original values. Uses less memory. Difficult to rebalance, implement safe concurrent access
Skip Lists: Insert and Delete don’t require rebalancing
Index Implementation Issues
Reference Counting: Increment the counting before accessing, Decrement it when finished. Bad performance for multi core CPU
Read-Copy-Update
Epoch Garbage Collection
Null Suppression (Useful in wide tables with sparse data)
Run-length Encoding
Bitmap Encoding (Only practical if the value cardinality is low)
Delta Encoding
Incremental Encoding
Mostly Encoding
Dictionary Encoding
Multi-Attribute Encoding
(original)
val1 val2
A 202
B 101
A 202
C 101
B 101
(compressed)
val1+val2
XX
YY
XX
ZZ
YY
val1 val2 code
A 202 XX
B 101 YY
C 101 ZZ
Order-Preserving Compression
(original)
name
Andrea
Joy
Andy
Dana
(compressed)
name
10
40
20
30
value code
Andrea 10
Andy 20
Dana 30
Joy 40
select * from users where ename like ‘And’
vs select * from users name between 10 and 20
select * from users where ename like ‘And’
-> Still have to perform seq scan
select distinct * from users where ename like ‘And’
-> Only need to access dictionary
OLTP DBMS cannot use the OLAP compress techniques because we need to support fast random tuple access. Indexes consume a large portion of the memory fo an OLTP database.
L12 Logging Protocols
The gold standard for physical logging & recovery in a disk-oriented DBMS is ARIES Algorithms for Recovery and Isolation Exploting Semantics
Write-Ahead Logging
Repeating History During Redo
Logging Changes During Undo
SiloR uses the epoch-based OCC, it achieves high performance by parallelizing all aspects of logging, checkpointing, and recovery.
The logger threads write buffers out to files. After 100 epoches, it creates a new file
Log record format, Id of the txn that modified the record(TID), (Table, Key, Value)
update people set islame = true where name in ('Dana', 'Andy')
L13 Checkpointing Protocols
Checkpointing allows the systems to ignore large segments of the log to reduce recovery time.
Naïve Snapshots
Hyper - Fork Snapshots
Copy-On-Update Snapshot
VoltDB - Consistent Checkpoints
Wait-Free ZigZag
Wait-Free Ping-Pong
Facebook Scuba
By storing the database shared memory, the DBMS process can restart and the memory contents will survive
On shutdown, copy data from heap to shared memory
L14 Optimizer Implementation (Part 1)
Heuristic-Based Optimization
Define static rules that transform logical operators to a physical plan
Advantages
Easy to implement and debug
Works reasonably well and is fast for simple queries
Disadvantages
Relies on magic constants that predict the efficacy of a planning decision
Nearly impossible to generate good plans when operators have complex inter-dependencies
Heuristics + Cost-Based Join Search
Use static rules to perform initial optimization. Then use dynamic programming to determine the best join order for tables
Advantages
Usually finds a reasonable plan without having to perform an exhaustive search
Disadvantages
Left-deep join trees are not always optimal
Randomized Algorithms
Advantages
jumping around the search space randomly allows the optimizer to get out out of local minimums
Low memory overhead (if no history is kept)
Disadvantages
Difficult to determine why the DBMS may have chosen a particular plan
Have to do extra work to ensure that query plans are deterministic
Still have to implement correctness rules
Starburst Optimizer
Advantages
Works well in practice with fast performance
Disadvantages
Difficult to assign priorities to transformations
Rules maintenance is a huge pain
Volcano Optimizer
Advantages
Use declarative rules to generate transformations
Better extensibility with an efficient search engine
Disadvantages
All equivalence classes are completely expanded to generate all possible logical operators before the optimization search
Stratified Search First rewrite the logical query plan using transformation rules and then perform a cost-based search to map the logical plan to physical plan
Unified Search Unify the notion of both logical→logical and logical→physical transformations
Cascade Optimizer Object oriented implementation of the Volcano query optimzer
Search Termination
Wall-clock Time
Cost Threshold
Transformation Exhaustion
L16 Cost Models
Cost model components are Physical costs, Logical costs and Algorithmic costs.
Postgres cost model uses a combination of CPU and I/O costs.
In memory is 400x faster then disk
Sequntial I/O is 4x faster than random I/O
IBM DB2 Learning Optimizer updates table stats as the DBMS scans a table during normal query processing.
Push: A centralized dispatcher assigns tasks to workers and monitors their progress
Pull: Workers pull the next task from a queue, process it, and then return to get the next task
L18 Parallel Join Algorithms (Hashing)
Many OLTP DBMSs don't implement hash join.
Hash join is the most important operator in a DBMS for OLAP workloads.
Build phase
For each tuple, hash the join key attribute for that tuple and add it to the appropriate bucket in the hash table.
Probe phase
For each tuple in S, hash its join key and check to see whether there is a match for each tuple in corresponding bucket in the hash table constructed for R.
Hash functions
* MurmurHash (2008)
* Google CityHash (2011)
* Google FarmHash (2014)
* CLHash (2016)
Hash table implementations
* Chained hash table: maintain a linked list of buckets for each slow in the hash table.
* Open-Addressing hash table: Single giant table of slots. Resolve collisions by linearly searching for the next free slot in the table.
* Cuckoo hash table: Use multiple hash tables with different hash functions.
Moving data between DRAM and GPU is slow over PCI-E bus
Sort-Merge Join
Phase #1 Sort: Sort the tuples of R and S based on the join key
Phase #2 Merge: The outer relation R only needs to be scanned once
Sorting is always the most expensive part
Parallel Sort-Merge Join
Phase #1 Partitioning (optional): Partition R and assign them to workers / cores
Phase #2 Sort
Phase #3 Merge
Partitioning phase
* Divide the relations into chunks and assign them to cores
Sort phase
* Create runs of sorted chunks of tuples for both input replations
* It used to be that Quicksort was good enough
* Sorting networks is similar to ghost leg
Merge phase
* Iterate through the outer table and inner table in lockstep and compare join keys
L20 Query Compilation
Code generation
* Transpilation
* JIT Compilation: generate IR(intermediate representation) of the query
L21 Vectorized Execution (Part I)
Mutli-Cores vs MIC(Many Integrated Cores)
Streaming SIMD Extensions (SSE) is a collection SIMD instructions that target special 128-bit SIMD registers
The restrict keyword in C++ tells the compiler that the arrays are distinct locations in memory
Fundamental elements of circutis
1. capacitor
2. Registor
3, Inductor
4. Memristor
Phase-change memory (PRM)
* A short pulse changes the cell to a '0'
* A long, gradual pulse changes the cell to a '1'
Resistive RAM (ReRAM)
Running a current one direction moves electrons from the top TiO2 layter to the botttom, thereby changing the resistance
Magnetoresistive RAM (MRAM)
Stores data using magnetic storage elements instead of electric charge or current flows
NVM for database systems
Block-addressable NVM is not that interesting.
Byte-addressable NVM will be a game changer but will require some work to use correctly
NVM-Aware memory allocator
#1 synchronization
➝ The allocator writes back CPU cache lines to NVM using the CLFLUSH instruction
➝ It then issues a SFENCE instruction to wait for the data to become durable on NVM
#2 Naming
➝ The allocator ensures that virtual memory addresses assigned to a memory-mapped region never change even after the OS or DBMS restarts.