sample = c("You're not crazy and I love you very much.")
y <- sentiment(sample, n.before = 4, n.after=2, amplifier.weight=1)
mean(y$sentiment)
y <- sentiment(sample, n.before = Inf, n.after=Inf, amplifier.weight=1)
mean(y$sentiment)
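To see how the context window changes the score, the same sentence can be scored over several window sizes. This is a small sketch assuming sentiment() comes from the sentimentr package loaded above.
# compare the average sentiment for several context-window sizes
for (w in c(2, 4, Inf)) {
  s <- sentiment(sample, n.before = w, n.after = w, amplifier.weight = 1)
  cat("window", w, "mean sentiment:", mean(s$sentiment), "\n")
}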
Create a corpus of Warren Buffett's letters for 2008-2012.
require(stringr)
require(tm)
# set up a data frame to hold the five letters (doc_id and text are the columns tm's DataframeSource expects)
df <- data.frame(doc_id = 2008:2012, text = character(5), stringsAsFactors = FALSE)
begin <- 2008 # letters in the range 2008-2012
i <- begin
# read the letters
while (i < 2013) {
y <- as.character(i)
# create the file name
f <- str_c('http://www.richardtwatson.com/BuffettLetters/',y, 'ltr.txt',sep='')
# read the letter as one large string
d <- readChar(f, nchars = 1e6)
# add the letter to the data frame
df$text[i - begin + 1] <- d
i <- i + 1
}
# create the corpus
letters <- Corpus(DataframeSource(df))
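A quick sanity check confirms that the corpus holds the five letters before any further processing.
# the corpus should contain five documents, one per year 2008-2012
length(letters)
print(letters)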
What is the Flesch-Kincaid score for the 2010 letter?
require(koRpus)
# tokenize the 2010 letter (the third document in the corpus)
tagged.text <- tokenize(as.character(letters[[3]]), format = "obj", lang = "en")
# score readability
readability(tagged.text, "Flesch.Kincaid", hyphen = NULL, force.lang = "en")
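To compare readability across years, the same two calls can be repeated for every letter. This is a sketch that assumes koRpus is set up as above and that each document can be coerced to a character string with as.character().
# Flesch-Kincaid score for each of the five letters
for (j in seq_along(letters)) {
  tagged <- tokenize(as.character(letters[[j]]), format = "obj", lang = "en")
  print(readability(tagged, "Flesch.Kincaid", hyphen = NULL, force.lang = "en"))
}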
Create a term-document matrix and find the words occurring more than 150 times in the letters for 2008-2012. Do appropriate preprocessing.
# convert to lower
clean.letters <- tm_map(letters, content_transformer(tolower))
# remove punctuation
clean.letters <- tm_map(clean.letters,removePunctuation)
# remove numbers
clean.letters <- tm_map(clean.letters,removeNumbers)
# remove stop words
clean.letters <- tm_map(clean.letters,removeWords,stopwords('SMART'))
# strip extra white space
clean.letters <- tm_map(clean.letters,stripWhitespace)
# stem the document -- takes a while to run
stem.letters <- tm_map(clean.letters,stemDocument, language = "english")
# stem completion -- takes a while to run
stem.letters <- tm_map(stem.letters,stemCompletion, dictionary=clean.letters)
# create term document matrix -- one row for each term and one column for each document
tdm <- TermDocumentMatrix(clean.letters, control = list(wordLengths = c(3, Inf)))
dim(tdm)
findFreqTerms(tdm, lowfreq = 150,highfreq = Inf)
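findFreqTerms() reports only which terms pass the threshold. If the counts themselves are wanted, they can be read directly from the matrix, as in this short sketch that reuses the tdm built above.
# frequencies of the terms occurring at least 150 times
freq <- rowSums(as.matrix(tdm))
sort(freq[freq >= 150], decreasing = TRUE)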
Report the frequency of the 20 most frequent words. Do several runs to identify words that should be removed from the top 20 and remove them.
# Create a term document matrix
tdm <- TermDocumentMatrix(stem.letters)
# convert term document matrix to a regular matrix to get frequencies of words
m <- as.matrix(tdm)
# sort on frequency of terms to get frequencies of words
v <- sort(rowSums(m), decreasing=TRUE)
# display the 20 most frequent words
v[1:20]
# continue the process by removing words, as sketched below
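The removal step can be scripted by listing the uninformative words found in the first run, dropping them from the stemmed corpus, and rebuilding the matrix. The word list below is purely illustrative; substitute the words identified in your own runs.
# illustrative word list -- replace with the words you decide to drop
drop.words <- c("berkshire", "year")
pruned.letters <- tm_map(stem.letters, removeWords, drop.words)
tdm <- TermDocumentMatrix(pruned.letters)
v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
v[1:20]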
Produce a word cloud for the words identified in the prior exercise.
library(wordcloud)
# put the word frequencies into a data frame for the cloud
d <- data.frame(word = names(v), freq = v)
# select the color palette
pal <- brewer.pal(5, "Accent")
# generate the cloud based on the 30 most frequent words
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)
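Because wordcloud() places words with some randomness, fixing the seed makes the layout reproducible across runs; this is a minor addition to the answer above.
# fix the random layout so repeated runs give the same cloud
set.seed(42)
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)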
Select a word and compute its association with other words in the Buffett letters corpus. Adjust the correlation coefficient to get about 10 words.
# compute the associations
findAssocs(tdm, "insurance",0.90)
Review the documentation of the hclust function in the stats package and try one or two other clustering techniques.
require(ggplot2)
require(ggdendro)
# setup the document term matrix
tdm <- TermDocumentMatrix(clean.letters)
# name the columns for the letter's year
colnames(tdm) <- 2008:2012
# Remove sparse terms
tdm1 <- removeSparseTerms(tdm, 0.5)
# transpose the matrix
tdmtranspose <- t(tdm1)
cluster <- hclust(dist(tdmtranspose), method = 'centroid')
# get the clustering data
dend <- as.dendrogram(cluster)
# plot the tree
ggdendrogram(dend, rotate = TRUE)
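The exercise asks for one or two alternatives to the centroid method. A sketch that swaps in Ward's method and complete linkage, both available directly through hclust, and plots each dendrogram the same way.
# Ward's method on the same distance matrix
cluster.ward <- hclust(dist(tdmtranspose), method = 'ward.D2')
ggdendrogram(as.dendrogram(cluster.ward), rotate = TRUE)
# complete linkage (hclust's default) for comparison
cluster.complete <- hclust(dist(tdmtranspose), method = 'complete')
ggdendrogram(as.dendrogram(cluster.complete), rotate = TRUE)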