4 SQL
4.1 General
https://www.w3schools.com/sql/default.asp ‘Getting started with SQL’ using SQLite. Download database found in link by vlicking the ZIP button and copy the contents to a folder of choice. The navigate in the top menu to Database -> add a database. https://github.com/thomasnield/oreilly_getting_started_with_sql
4.1.1 SELECT
SELECT * FROM CUSTOMER;
SELECT CUSTOMER_ID, NAME FROM CUSTOMER;
# Generate a calculated column
SELECT
PRODUCT_ID,
DESCRIPTION,
PRICE AS UNTAXED_PRICE,
round(PRICE * 1.07,2) AS TAXED_PRICE
FROM PRODUCT;
# Text concatenation. Concatenation works with any data type.
SELECT NAME,
CITY || ', ' || STATE AS LOCATION
FROM CUSTOMER;
4.1.2 WHERE
SELECT * FROM STATION_DATA
WHERE year = 2010;
#Use != or <> to get everything but 2010
SELECT * FROM STATION_DATA
WHERE year != 2010;
#AND, OR, IN statements
SELECT * FROM STATION_DATA
WHERE year >= 2005 AND year <= 2010;
#in
SELECT * FROM STATION_DATA
WHERE MONTH IN (3,6,9,12)
#not in
SELECT * FROM STATION_DATA
WHERE MONTH NOT IN (3,6,9,12)
#modulus operator
#modulus returns the remainder instead of the quotient. A remainder of 0 means there is no remainder at all
SELECT * FROM STATION_DATA
WHERE MONTH % 3 = 0
#using where on text
SELECT * FROM STATION_DATA
WHERE report_code in ('513A63', '1F8A7B')
SELECT * FROM STATION_DATA
WHERE length(report_code) = 6
#wildcards
SELECT * FROM STATION_DATA
WHERE report_code LIKE 'A%'
#B as the first, C as the third letter
SELECT * FROM STATION_DATA
WHERE report_code LIKE 'B_C%'
#Other text functions as INSTR, SUBSTR, REPLACE ETC
SELECT * FROM station_data
WHERE snow_depth IS NULL;
#Use coalesce to conert NULL to 0, "N/A" etc
SELECT * FROM station_data
WHERE coalesce(precipitation,0) <= 0.5;
SELECT * FROM station_data
WHERE (rain = 1 AND temperature <= 32)
OR snow_depth > 0;
4.1.3 GROUP BY
WHERE filters individual records while HAVING filters aggregations.
SELECT year, month, COUNT(*) AS record_count FROM station_data
WHERE tornado = 1
GROUP BY year, month
ORDER BY year, month
#aggregate
SELECT month, AVG(temperature) as avg_temp
FROM station_data
WHERE year >= 2000
GROUP BY month
#HAVING statement
#To be able to filter on aggregate data you need to use HAVING instead of WHERE
SELECT year,
SUM(precipitation) as total_precipitation
FROM station_data
GROUP BY year
HAVING sum(precipitation) > 30
4.1.4 CASE statements
CASE statements are read from top to bottom, so the first true condition is the one it uses. A great benefit of CASE statements compared to WHERE is that you can apply different filters for different aggregate values.
SELECT report_code, year, month, day, wind_speed,
CASE
WHEN wind_speed >= 40 THEN 'HIGH'
WHEN wind_speed >= 30 THEN 'MODERATE'
ELSE 'LOW'
END as wind_severity
FROM station_data
# Use of CASE to apply different filters
SELECT year, month,
SUM(CASE WHEN tornado = 1 THEN precipitation ELSE 0 END) as tornado_precipitation,
SUM(CASE WHEN tornado = 0 THEN precipitation ELSE 0 END) as non_tornado_precipitation
FROM station_data
GROUP BY year, month
# Use of boolean expression
SELECT month,
AVG(CASE WHEN rain OR hail THEN temperature ELSE null END) AS avg_precipitation_temp,
AVG(CASE WHEN NOT (rain OR hail) THEN temperature ELSE null END) AS avg_non_precipitation_temp
FROM station_data
WHERE year > 2000
GROUP BY month
4.1.5 JOIN
In multiple joins, it may be erroundous to mix LEFT JOIN with INNER JOIN. This is becasue null values cannot be inner joined on and will get filtered out. LEFT JOIN tolerates null values.
#INNER JOIN
SELECT ORDER_ID,
CUSTOMER.CUSTOMER_ID,
ORDER_DATE,
ORDER_QTY
FROM CUSTOMER INNER JOIN CUSTOMER_ORDER
ON CUSTOMER.CUSTOMER_ID = CUSTOMER_ORDER.CUSTOMER_ID
#LEFT JOIN
SELECT ORDER_ID,
CUSTOMER.CUSTOMER_ID,
ORDER_DATE,
ORDER_QTY
FROM CUSTOMER LEFT JOIN CUSTOMER_ORDER
ON CUSTOMER.CUSTOMER_ID = CUSTOMER_ORDER.CUSTOMER_ID
#Checking for NULL values
SELECT CUSTOMER.CUSTOMER_ID,
NAME AS CUSTOMER_NAME
FROM CUSTOMER LEFT JOIN CUSTOMER_ORDER
ON CUSTOMER.CUSTOMER_ID = CUSTOMER_ORDER.CUSTOMER_ID
WHERE ORDER_ID IS NULL
#Multiple joins
SELECT
ORDER_ID,
CUSTOMER.CUSTOMER_ID,
'NAME AS CUSTOMER.NAME,'
STREET_ADDRESS,
CITY,
STATE,
ZIP,
ORDER_DATE,
PRODUCT_ID,
DESCRIPTION,
ORDER_QTY
FROM CUSTOMER
INNER JOIN
CUSTOMER_ORDER ON CUSTOMER_ORDER.CUSTOMER_ID = CUSTOMER.CUSTOMER_ID
INNER JOIN
PRODUCT ON CUSTOMER_ORDER.PRODUCT_ID = PRODUCT.PRODUCT_ID
#Use coalesce() to turn nulls into zeros.
SELECT
CUSTOMER.CUSTOMER_ID,
NAME AS CUSTOMER_NAME,
coalesce(sum(ORDER_QTY * PRICE), 0) as TOTAL_REVENUE
FROM CUSTOMER
LEFT JOIN CUSTOMER_ORDER
ON CUSTOMER.CUSTOMER_ID = CUSTOMER_ORDER.CUSTOMER_ID
LEFT JOIN PRODUCT
ON CUSTOMER_ORDER.PRODUCT_ID = PRODUCT.PRODUCT_ID
GROUP BY 1,2
4.2 Setting up a database
Always strive to gave a primary key on every table, which provides a unique identify to each record. Foreign key in a child table points to the primary key in its parent table. A database schema is a diagram showing tables, their columns, and their relationships.
4.2.1 Creating tables
#Autoincrement allows a unique number to be generated automatically when a new record is inserted into a table. Not needed for SQLite, but for MySQL and some other platforms.
#Allowing fields to be NULL can be done for fields that are not mandatory, while using NOT NULL should be done for fields that needs to be populated.
CREATE TABLE COMPANY (
COMPANY_ID INTEGER PRIMARY KEY AUTOINCREMENT,
NAME VARCHAR(30) NOT NULL,
DESCRIPTION VARCHAR(60),
PRIMARY_CONTACT_ID INTEGER NOT NULL
);
CREATE TABLE ROOM (
ROOM_ID INTEGER PRIMARY KEY AUTOINCREMENT,
FLOOR_NUMBE INTEGER NOT NULL,
SEAT_CAPASITY INTEGER NOT NULL
);
CREATE TABLE PRESENTATION (
PRESENTATION_ID INTEGER PRIMARY KEY AUTOINCREMENT,
BOOKED_COMPANY_ID INTEGER NOT NULL,
BOOK_ROOM_ID INTEGER NOT NULL,
START_TIME TIME,
END_TIME TIME
);
CREATE TABLE ATTENDEE (
ATTENDEE_ID INTEGER PRIMARY KEY AUTOINCREMENT,
FIRST_NAME VARCHAR (30) NOT NULL,
LAST_NAME VARCHAR (30) NOT NULL,
PHONE INTEGER,
EMAIL VARCHAR (30),
VIP BOOLEAN DEFAULT(0) 'BOOLEAN MAKE DEFAULT FALSE (0)'
);
CREATE TABLE PRESENTAION_ATTENDANCE (
TICKET_id INTEGER PRIMARY KEY AUTOINCREMENT,
PRESENTATION_ID INTEGER,
ATTENDEE_ID INTEGER
);
#Foreign keys in a child table should be tied to primary key of a parent table. Setting up foreign keys by double clicking on the varible to modify and link this to a primary key of a parent table.
4.2.2 Creating views
Creating views stores querys in the database so it can easily be called upon.
CREATE VIEW PRESENTATION_VW AS
SELECT COMPANY.NAME AS BOOKED_COMPANY,
ROOM.ROOM_ID AS ROOM_NUMBER,
ROOM.FLOOR_NUMBE AS FLOOR,
ROOM.SEAT_CAPASITY AS SEATS,
START_TIME,
END_TIME
FROM PRESENTATION
INNER JOIN
COMPANY ON PRESENTATION.BOOKED_COMPANY_ID = COMPANY.COMPANY_ID
INNER JOIN
ROOM ON PRESENTATION.BOOK_ROOM_ID = ROOM.ROOM_ID;
# Queryn from a view
SELECT * FROM PRESENTATION_VW
WHERE SEAT_CAPACITY >= 30
4.2.3 Populating database
4.2.3.1 INSERT
INSERT INTO ATTENDEE (FIRST_NAME, LAST_NAME)
VALUES ('THOMAS', 'NIELD')
SELECT * FROM ATTENDEE
#Multiple inserts. Can be used with Python to populate large amounts of data.
INSERT INTO ATTENDEE (FIRST_NAME, LAST_NAME, PHONE, EMAIL, VIP)
VALUES
('Jon', 'Skeeter', 3525253523, 'john.skeeter@rex.net', 1),
('Sam', 'Jones', 25235235235, 'sam.jones@hej.se', 0),
('Karl', 'Andersson', 5235235, 'karl.andersson@gmail.com',1)
#Inserting with a SELECT from another table
INSERT INTO ATTENDEE (FIRST_NAME, LAST_NAME, PHONE, EMAIL)
SELECT FIRST_NAME, LAST_NAME, PHONE, EMAIL
FROM ANOTHER_TABLE
4.2.3.2 DELETE
Combining DELETE with WHERE. It could be good to replace DELETE with SELECT to get a preview what will be deleted.
4.2.3.3 UPDATE
UPDATE modifies existing records.
4.2.4 Connecting to database from R
Connecting to the database from R.
#https://db.rstudio.com/dplyr/
library(dplyr)
library(DBI)
library(RSQLite)
con <- dbConnect(SQLite(), "C:\\Users\\chris\\Documents\\oreilly_getting_started_with_sql-master\\surgetech_conferencev2.db")
#Show list of table
as.data.frame(dbListTables(con))
#get data in data frame with sql syntax
df <- data.frame(tbl(con, sql("SELECT * FROM ATTENDEE")))
#with dplyr syntax
df2 <- data.frame(tbl(con, "ATTENDEE"))
# When data is fetched, disconnect to db.
dbDisconnect(con)
4.3 Useful expressions
Operator | Description | Example |
---|---|---|
abs() | Calculates the absolute value | abs(x) |
coalesce() | Converts a possible null value into a default | coalesce(z,y) |
instr() | Checks if a text string contains another text string | instr(‘HTX’, ‘TX’) |
length() | Provides the number of characters in a string | length(‘Test’) |
trim() | Removes extraneous on both sides of a string | trim(’ Test ’) |
ltrim() | Removes extraneous on the left side of a string | ltrim(’ Test’) |
rtrim() | Removes extraneous on the right side of a string | rtrim(‘Test’) |
random() | Returns a random number | random() |
round() | Rounds a decimal to specified number | round(100.101, 2) |
replace() | Replaces a substring of text in a string | replace(‘Test df’, ’ df’, ’’) |
substr() | Extracts a range of characters from a string with their num position | substr(‘Test’, 2,3) |
lower() | Turns all letters in a string to lowercase | lower(‘Test’) |
upper() | Turns all letters in a string to uppercase | upper(‘Test’) |
4.4 SQL databases and R
4.4.1 Connecting to a database from R
Access a database from R. The problem with dplyr is that all operations are conducted in-memory and thus the amount of data you can work with is limited by available memory. The database connection essentially removes that limitation. Accessing a temp database by downloading it. Dplyr and dbplyr are used in R to point to the database.
#https://datacarpentry.org/R-ecology-lesson/05-r-and-databases.html
library("RSQLite")
library("plyr")
library("dbplyr")
# Downloading database for test
dir.create("data_raw", showWarnings = FALSE)
download.file(url = "https://ndownloader.figshare.com/files/2292171",
destfile = "data_raw/portal_mammals.sqlite", mode = "wb")
# Connect to databse
mammals <- DBI::dbConnect(RSQLite::SQLite(), "data_raw/portal_mammals.sqlite")
# Querying with SQL-syntax vs dplyr syntax
#sql
tbl(mammals, sql("SELECT year, species_id, plot_id FROM surveys"))
#dplyr
surveys <- tbl(mammals, "surveys")
surveys %>%
select(year, species_id, plot_id)
#Checking head(surveys, n = 10) and nrow(surveys) we see that the surveys at first glance looks like a data frame but there are some differences.*
head(surveys, n = 10)
nrow(surveys)
#The reason for this is that dplyr dosen´t see the full dataset, only what was asked for when the question in dplyr was translated into SQL.
4.4.2 Running SQL syntax in R
library(sqldf)
#https://dept.stat.lsa.umich.edu/~jerrick/courses/stat701/notes/sql.html#introduction
sqldf('SELECT age, circumference FROM Orange WHERE Tree = 1 ORDER BY circumference ASC')
sqldf("SELECT * FROM iris")
#example
data(BOD)
BOD
#Wildcard: used to extract everything
bod2 <- sqldf('SELECT * FROM BOD')
bod2
#LIMIT controls the number of results
sqldf('SELECT * FROM iris LIMIT 5')
#ORDER BY syntax: ORDER BY var1 {ASC/DESC}, var2 {ASC/DESC}
sqldf("SELECT * FROM Orange ORDER BY age ASC, circumference DESC LIMIT 5")
#Where can be used to add conditional statements
sqldf('SELECT demand FROM BOD WHERE Time < 3')
#WHERE with AND and OR
sqldf('SELECT * FROM rock WHERE (peri > 5000 AND shape < .05) OR perm > 1000')
#IN is used to similiar to %in%
sqldf('SELECT * FROM BOD WHERE Time IN (1,7)')
sqldf('SELECT * FROM BOD WHERE Time NOT IN (1,7)')
#LIKE weak expression command
sqldf('SELECT * FROM chickwts WHERE feed LIKE "%bean" LIMIT 5')
sqldf('SELECT * FROM chickwts WHERE feed NOT LIKE "%bean" LIMIT 5')
#Aggregated data: AVG, MEDIAN, MAX, MIN, SUM
sqldf("SELECT AVG(circumference) FROM Orange")
#SELECT COUNT
d <- data.frame(a = c(1,1,1), b = c(1,NA,NA))
d
sqldf("SELECT COUNT() as numrows FROM d")
sqldf("SELECT COUNT(b) FROM d")