Banner

 

17 - HDFS & MapReduce

Answers to exercises

1.
Write a MapReduce function for the following situations. In each case, write the corresponding R code to understand how MapReduce and conventional programming differ.
1a.
Compute the square and cube of the numbers in the range 1 to 25. Display the results in a data frame.
# R
# create a vector of the 25 integers
ints <- 1:25
# arithmetic in R is vectorized, so the powers need no explicit loop;
# collect n, n^2 and n^3 side by side in a data frame, one row per integer,
# as the exercise asks
result <- data.frame(n = ints, square = ints^2, cube = ints^3)
result

# MapReduce
library(rmr2)
rmr.options(backend = "local") # local or hadoop
# load a list of 25 integers into HDFS
hdfs.ints <- to.dfs(1:25)
# mapper: for every integer n in the chunk v, emit n as the key and the
# matrix row (n^2, n^3) as the value. Keeping the square and the cube in the
# same row means they never have to be matched up again by row position.
mapper <- function(k, v) {
  powers <- cbind(v^2, v^3)
  keyval(v, powers)
}
# run MapReduce (map only; no reducer is needed)
out <- mapreduce(input = hdfs.ints, map = mapper)
# retrieve the key-value pairs and assemble one row per n: n, n^2, n^3
result <- from.dfs(out)
output <- data.frame(result$key, result$val)
names(output) <- c("key", "n^2", "n^3")
head(output)
1c.
Using the average monthly temperatures for New York's Central Park, compute the max, mean, and min for August.
# R
library(readr)
# read the Central Park temperature file (comma-delimited) from the web
url <- "http://people.terry.uga.edu/rwatson/data/centralparktemps.txt"
t <- read_delim(url, delim = ",")
# keep only the rows for month 8 (August)
is_august <- t$month == 8
t8 <- t[is_august, ]
# report the three statistics for the August temperatures
august_temps <- t8$temperature
max(august_temps)
mean(august_temps)
min(august_temps)
# MapReduce
library(rmr2)
rmr.options(backend = "local") # local or hadoop
library(reshape)
library(readr)
url <- "http://people.terry.uga.edu/rwatson/data/centralparktemps.txt"
t <- read_delim(url, delim = ",")
# save the full temperature table in an hdfs file;
# the August rows are selected inside the mapper
hdfs.temp <- to.dfs(data.frame(t))
# mapper: emit the month-8 temperatures of the chunk under the key "August"
mapper <- function(k, v) {
  august <- which(v$month == 8)
  keyval("August", v$temperature[august])
}
# reducer: for a key (the month) and its vector of values, report
# the count, max, mean (rounded to 1 decimal) and min, in that order
reducer <- function(k, v) {
  stats <- c(length(v), max(v), round(mean(v), 1), min(v))
  keyval(k, stats)
}
out <- mapreduce(input = hdfs.temp, map = mapper, reduce = reducer)
# convert to a frame
df <- as.data.frame(from.dfs(out))
# add measure identifiers, in the same order the reducer emits the values
df$measure <- c("observations", "max", "mean", "min")
df

This page is part of the promotional and support material for Data Management (open edition) by Richard T. Watson
For questions and comments please contact the author

Date revised: 02-Dec-2022