17 - Text Mining
Slide Exercises

Run the following code and comment on how sensitive sentiment analysis is to the n.before and n.after parameters.
require(sentimentr)
sample <- c("You're not crazy and I love you very much.")
y <- sentiment(sample, n.before = 4, n.after = 2, amplifier.weight = 1)
mean(y$sentiment)
y <- sentiment(sample, n.before = Inf, n.after = Inf, amplifier.weight = 1)
mean(y$sentiment)
The computed mean sentiment can vary considerably with the choice of n.before and n.after, which set the context window used for valence shifters such as negators and amplifiers. The effect tends to be more noticeable for a short text than for a longer one.
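To make the sensitivity easy to see, a small sketch (the window sizes below are arbitrary choices) computes the mean sentiment for several settings:
require(sentimentr)
# compare the mean sentiment across several context-window settings
windows <- list(c(2, 1), c(4, 2), c(6, 4), c(Inf, Inf))
for (w in windows) {
  s <- sentiment(sample, n.before = w[1], n.after = w[2], amplifier.weight = 1)
  cat('n.before =', w[1], 'n.after =', w[2], 'mean sentiment =', round(mean(s$sentiment), 3), '\n')
}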

Create a corpus of Warren Buffett's letters for 2008-2012.

require(stringr)
require(tm)
# set up a data frame to hold the five letters (DataframeSource expects doc_id and text columns)
df <- data.frame(doc_id = character(5), text = character(5), stringsAsFactors = FALSE)
begin <- 2008 # letters in the range 2008-2012
i <- begin
# read the letters
while (i < 2013) {
  y <- as.character(i)
  # create the file name
  f <- str_c('http://www.richardtwatson.com/BuffettLetters/', y, 'ltr.txt', sep = '')
  # read the letter as one large string
  d <- readChar(f, nchars = 1e6)
  # add the letter to the data frame, keyed by its year
  df[i - begin + 1, ] <- c(y, d)
  i <- i + 1
}
# create the corpus
letters <- Corpus(DataframeSource(df))
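A quick optional check that all five letters loaded:
# the corpus should contain five documents, one per year
length(letters)
# number of characters in each letter
sapply(letters, function(x) nchar(content(x)))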

What is the Flesch-Kincaid score for the 2010 letter?

require(koRpus)
# tokenize the 2010 letter (the third document in the corpus)
tagged.text <- tokenize(as.character(content(letters[[3]])), format = "obj", lang = "en")
# score
readability(tagged.text, index = "Flesch.Kincaid", hyphen = NULL)
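The same steps can be applied to every letter; the sketch below loops over the corpus, assuming the documents are ordered 2008 through 2012 as constructed above:
# score each letter in the corpus (the third result is the 2010 letter)
for (k in seq_along(letters)) {
  tt <- tokenize(as.character(content(letters[[k]])), format = "obj", lang = "en")
  print(readability(tt, index = "Flesch.Kincaid", hyphen = NULL))
}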

Create a term-document matrix and find the words occurring more than 150 times in the letters for 2008-2012. Do appropriate preprocessing.

# convert to lower case
clean.letters <- tm_map(letters, content_transformer(tolower))
# remove punctuation
clean.letters <- tm_map(clean.letters, removePunctuation)
# remove numbers
clean.letters <- tm_map(clean.letters, removeNumbers)
# remove stop words
clean.letters <- tm_map(clean.letters, removeWords, stopwords('SMART'))
# strip extra white space
clean.letters <- tm_map(clean.letters, stripWhitespace)
# stem the documents -- takes a while to run
stem.letters <- tm_map(clean.letters, stemDocument, language = "english")
# stem completion -- takes a while to run
stem.letters <- tm_map(stem.letters, stemCompletion, dictionary = clean.letters)
# create the term-document matrix -- one row for each term and one column for each document
tdm <- TermDocumentMatrix(clean.letters, control = list(wordLengths = c(3, Inf)))
dim(tdm)
# find the terms occurring at least 150 times
findFreqTerms(tdm, lowfreq = 150, highfreq = Inf)

Report the frequency of the 20 most frequent words. Do several runs to identify words that should be removed from the top 20 and remove them.

# Create a term document matrix
tdm <- TermDocumentMatrix(stem.letters)
# convert the term-document matrix to a regular matrix
m <- as.matrix(tdm)
# sum each term's frequency across the documents and sort in decreasing order
v <- sort(rowSums(m), decreasing = TRUE)
# display the 20 most frequent words, then repeat after removing uninformative words
v[1:20]
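One way to continue is sketched below; the extra stop-word list is a placeholder and should be replaced with whatever uninformative terms appear in your own top-20 output:
# remove additional uninformative high-frequency words (placeholder list) and recompute
extra.stops <- c('will', 'year', 'also')
clean.letters2 <- tm_map(stem.letters, removeWords, extra.stops)
v2 <- sort(rowSums(as.matrix(TermDocumentMatrix(clean.letters2))), decreasing = TRUE)
v2[1:20]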

Produce a word cloud for the words identified in the prior exercise.

library(wordcloud)
# build a data frame of words and their frequencies from the sorted vector v
d <- data.frame(word = names(v), freq = v)
# select the color palette
pal <- brewer.pal(5, "Accent")
# generate the cloud based on the 30 most frequent words
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)
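Word placement in the cloud is randomised, so fixing the random seed beforehand makes the layout reproducible across runs:
# fix the random seed so repeated runs give the same layout
set.seed(42)
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)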

Select a word and compute its association with other words in the Buffett letters corpus. Adjust the correlation coefficient to get about 10 words.

# compute the associations
findAssocs(tdm, "insurance",0.90)
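To settle on roughly 10 words, one approach is to try several correlation limits and count how many terms each returns (the limits below are arbitrary starting points):
# count the associated terms returned at several correlation limits
for (r in c(0.95, 0.90, 0.85, 0.80)) {
  cat('limit =', r, ':', length(findAssocs(tdm, 'insurance', r)$insurance), 'terms\n')
}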

Review the documentation of the hclust function in the stats package and try one or two other clustering techniques.

require(ggplot2)
require(ggdendro)
# set up the term-document matrix
tdm <- TermDocumentMatrix(clean.letters)
# name the columns with the letters' years
colnames(tdm) <- 2008:2012
# remove sparse terms
tdm1 <- removeSparseTerms(tdm, 0.5)
# transpose the matrix so that the documents, rather than the terms, are clustered
tdmtranspose <- t(tdm1)
# hierarchical clustering using the centroid agglomeration method
cluster <- hclust(dist(tdmtranspose), method = 'centroid')
# get the clustering data
dend <- as.dendrogram(cluster)
# plot the tree
ggdendrogram(dend,rotate=T)
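Two possible alternatives are sketched below, with arbitrary choices: Ward's agglomeration method for hclust, and k-means with two clusters:
# hierarchical clustering with Ward's method instead of centroid linkage
cluster2 <- hclust(dist(tdmtranspose), method = 'ward.D2')
ggdendrogram(as.dendrogram(cluster2), rotate = T)
# k-means on the transposed matrix, with an arbitrary choice of two clusters
kmeans(as.matrix(tdmtranspose), centers = 2)$cluster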

This page is part of the promotional and support material for Data Management (open edition) by Richard T. Watson
For questions and comments, please contact the author.

Date revised: 02-Dec-2022