Banner

 

16 - Data Visualization

Answers to exercises

1.
Given the following data on the world’s cargo carrying capacity in millions of dead weight tons (dwt) for the given years, graph the relationship as a point plot. Add a linear prediction model to the graph. What is the approximate increase in the world capacity each year?
dwt <-  c(2566, 3704, 4008, 5984, 7700, 8034, 8229, 7858, 8408, 8939)
year <- c(1970, 1980, 1990, 2000, 2006, 2007, 2008, 2009, 2010, 2011)
library(ggplot2)
dwt <- c(2566, 3704, 4008, 5984, 7700, 8034, 8229, 7858, 8408, 8939)
year <- c(1970, 1980, 1990, 2000, 2006, 2007, 2008, 2009, 2010, 2011)
df1 <- data.frame(year, dwt)
# ggplot
ggplot(df1,aes(year, dwt)) +
geom_point(color = 'red') +
xlab('Year') +
ylab('Millions of dead weight tons') +
ylim(0,10000)
2.
Create a graph showing the relative ocean shipping cost as a percentage of total cost for some common household items using the following data.
unitCost <-  c(700, 200, 150, 50, 15, 3, 1)
shipCost <-  c(10.00, 1.50, 1.00, 0.15, 0.15, 0.05, 0.01)
item <-  c('TV set', 'DVD player', 'Vacuum cleaner', 'Scotch whisky', 'Coffee', 'Biscuits', 'Beer')
library(ggplot2)
unitCost <- c(700, 200, 150, 50, 15, 3, 1)
shipCost <- c(10.00, 1.50, 1.00, 0.15, 0.15, 0.05, 0.01)
diff <- shipCost/unitCost*100
item <- c('TV set', 'DVD player', 'Vacuum cleaner', 'Scotch whisky', 'Coffee', 'Biscuits', 'Beer')
df2 <- data.frame(unitCost, shipCost,diff, item)
# ggplot
ggplot(df2,aes(item, diff)) +
geom_col(fill = 'bisque') +
xlab('Item') +
ylab('Relative cost of shipping (%)')
3.
Visualize in blue the number of items for each product scale.
library(ggplot2)
library(DBI)
library(RMySQL)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT productScale from Products;")
# Plot the number of product lines by specifying the appropriate column name
# Internal fill color is blue
# ggplot
ggplot(d,aes(productScale)) +
geom_bar(fill = 'blue') +
xlab('Product scale') +
ylab('Frequency')
4.
Prepare a line plot with appropriate labels for total payments for each month in 2004.
library(ggplot2)
library(DBI)
library(RMySQL)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT MONTH(paymentDate) as month, SUM(amount) AS monthTotal FROM Payments WHERE YEAR(paymentDate)= 2004 GROUP BY MONTH(paymentDate);") # ggplot
ggplot(d,aes(month, monthTotal)) +
geom_line(color = 'green') +
xlab('Month') +
ylab('Total payments')
5.
Create a histogram with appropriate labels for the value of orders received from the Nordic countries (Denmark, Finland, Norway, Sweden).
library(ggplot2)
library(DBI)
library(RMySQL)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT country, sum(quantityOrdered*priceEach) AS orders FROM Orders, OrderDetails, Customers WHERE Orders.orderNumber = OrderDetails.orderNumber and Customers.customerNumber = Orders.customerNumber AND country IN ('Denmark','Finland', 'Norway','Sweden') GROUP BY countr
# ggplot
ggplot(d,aes(country, orders)) +
geom_col(fill = 'yellow') +
xlab('Country') +
ylab('Orders')
6.
Create a heatmap for product lines and Norwegian cities.
library(ggplot2)
library(DBI)
library(RMySQL)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT count(*) as frequency, productLine, city
FROM Products, OrderDetails, Orders, Customers
WHERE Products.productCode = OrderDetails.productCode
AND OrderDetails.orderNumber = Orders.orderNumber
AND Orders.customerNumber = Customers.customerNumber
AND country = 'Norway'
GROUP BY productLine, city")
# ggplot
ggplot(d,aes(productLine, city)) +
geom_tile(aes(fill=frequency)) +
scale_fill_gradient(low=" light blue", high="dark blue") +
xlab('Line') +
ylab('Scale')
7.
Show on a Google map the customers in Japan.
library(ggplot2)
library(DBI)
library(RMySQL)
library(ggmap)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT y(officeLocation) AS lon, x(officeLocation) AS lat FROM Offices;")
map <-  get_googlemap('tokyo, japan',marker=d,zoom=5)
ggmap(map) + labs(x = 'Longitude', y = 'Latitude') + ggtitle('Japanese offices')
8.
Show on a Google map the European customers who have never placed an order.
library(ggplot2)
library(DBI)
library(RMySQL)
library(ggmap)
# connect to the database
conn <- dbConnect(RMySQL::MySQL(),"www.richardtwatson.com", dbname = "ClassicModels", user = "student", password = "student")
# Query the database and create file for use with R
d <- dbGetQuery(conn,"SELECT y(customerLocation) AS lon, x(customerLocation) AS lat 
FROM Customers WHERE customerNumber NOT IN (SELECT customerNumber FROM Orders);")
map <-  get_googlemap('europe',marker=d,zoom=4)
ggmap(map) + labs(x = 'Longitude', y = 'Latitude') + ggtitle('European customers without orders')
9.
Access http://www.richardtwatson.com/data/manheim.txt, which contains details of the sales of three car models: X, Y, and Z.
9a.
Create a bar chart for sales of each model (X, Y, or Z)
library(ggplot2)
library(readr)
url <- 'http://www.richardtwatson.com/data/manheim.txt'
t <- read_delim(url, delim=',')
# ggplot
ggplot(t,aes(model)) +
geom_bar(fill = 'lightgreen')
9b.
Create bar chart for sales by each form of sale (online or auction)
library(ggplot2)
library(readr)
url <- 'http://www.richardtwatson.com/data/manheim.txt'
t <- read_delim(url, delim=',')
# ggplot
ggplot(t,aes(sale)) +
geom_bar(fill = 'violet')
10.
Use the 'Import Dataset' feature of RStudio to read http://www.richardtwatson.com/data/electricityprices2010_14.csv, which contains details of electricity prices for a major city. Do a Box plot of cost. What do you conclude about cost?
library(ggplot2)
library(readr)
# read with Import Dataset
url <- 'http://www.richardtwatson.com/data/electricityprices.csv'
t <- read_delim (url, delim = ',')
# ggplot
ggplot(t,aes(factor(0),cost)) +
geom_boxplot(outlier.colour='red') +
xlab("Hourly measures") +
ylab("Cost")
11.
Read the table http://www.richardtwatson.com/data/wealth.csv containing details of the wealth of various countries. Create histograms for each of the wealth measures. Use the list of valid colors to give a bit of color to your graphics.
library(ggplot2)
library(readr)
url <-  'http://www.richardtwatson.com/data/wealth.csv'
t <-  read_delim(url, delim=',')
# ggplot
ggplot(t, aes(`GDP per capita`)) +
     geom_histogram(fill='firebrick')
ggplot(t, aes(`GDP per capita`)) +
     geom_histogram(fill='darkorange')
ggplot(t, aes(`GDP per capita`)) +
     geom_histogram(fill='darkolivegreen')
12.

Merge the weather and electricity price data for a major city. The merged file should contain air temperature and electricity cost. Also, you need to convert air temperature from a factor to a numeric (hint: first convert it to a character).

library(readr)
library(dplyr)
wurl <- 'http://www.richardtwatson.com/data/weather.csv'
w <- read_delim(wurl,delim = ',')
eurl <- 'http://www.richardtwatson.com/data/electricityprices.csv'
e <- read_delim(eurl,delim = ',')
m <- inner_join(e,w, by = c('timestamp' = 'Timestamp'))
m$airtemp <- as.numeric(as.character(m$Air.Temp..F.))
head(m)
12a.

Compute the correlation between temperature and electricity price. What do you conclude?

cor.test(m$airtemp,m$cost)
Conclude there is a significant relationship between temperature and cost of electricity (p < .05)
12b.
Graph the relationship between temperature and electricity price.
library(ggplot2)
m$airtemp <- as.numeric(m$Air.Temp..F.)
# ggplot
ggplot(m, aes(airtemp, cost)) +
geom_point(color ='FireBrick') +
xlab('Air temperature') +
ylab('Electricity cost')
12c.
Graph the relationship between temperature and electricity price when the temperature is 95ºF and above.
library(ggplot2)
library(dplyr)
m1 <- m %>% filter(airtemp >= 95)
# ggplot
ggplot(m1,aes(airtemp, cost)) +
geom_point(color ='FireBrick') +
xlab('Air temperature') +
ylab('Electricity cost')
12d.
Create a single graph showing the relationship between temperature and electricity price differentiating by color when the temperature is above or below 90ºF. (Hint: Trying recoding temperature).
library(ggplot2)
library(dplyr)
m$Cut <- 'Not so hot'
m$Cut[m$airtemp >= 90] <- 'Hot'
# ggplot
ggplot(m,aes(airtemp, cost, group = Cut)) +
geom_point(aes(color = Cut)) +
xlab('Air temperature') +
ylab('Electricity cost')
This page is part of the promotional and support material for Data Management (open edition) by Richard T. Watson
For questions and comments please contact the author
Date revised: 10-Dec-2021