sample = c("You're not crazy and I love you very much.")
y <- sentiment(sample, n.before = 4, n.after=2, amplifier.weight=1)
mean(y$sentiment)
y <- sentiment(sample, n.before = Inf, n.after=Inf, amplifier.weight=1)
mean(y$sentiment)
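To see how the context window changes the score, the same sentence can be scored over several window sizes. This is a small sketch assuming sentiment() comes from the sentimentr package loaded above.
# compare the average sentiment for several context-window sizes
for (w in c(2, 4, Inf)) {
  s <- sentiment(sample, n.before = w, n.after = w, amplifier.weight = 1)
  cat("window", w, "mean sentiment:", mean(s$sentiment), "\n")
}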
Create a corpus of Warren Buffett's letters for 2008-2012.
require(stringr)
require(tm)
# set up a data frame to hold the five letters (doc_id and text are the columns tm's DataframeSource expects)
df <- data.frame(doc_id = 2008:2012, text = character(5), stringsAsFactors = FALSE)
begin <- 2008 # letters in the range 2008-2012
i <- begin
# read the letters
while (i < 2013) {
y <- as.character(i)
# create the file name
f <- str_c('http://www.richardtwatson.com/BuffettLetters/',y, 'ltr.txt',sep='')
# read the letter as one large string
d <- readChar(f, nchars = 1e6)
# add the letter to the data frame
df$text[i - begin + 1] <- d
i <- i + 1
}
# create the corpus
letters <- Corpus(DataframeSource(df))
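A quick sanity check confirms that the corpus holds the five letters before any further processing.
# the corpus should contain five documents, one per year 2008-2012
length(letters)
print(letters)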
What is the Flesch-Kincaid score for the 2010 letter?
require(koRpus)
# tokenize the 2010 letter (the third document in the corpus)
tagged.text <- tokenize(as.character(letters[[3]]), format = "obj", lang = "en")
# score readability
readability(tagged.text, "Flesch.Kincaid", hyphen = NULL, force.lang = "en")
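To compare readability across years, the same two calls can be repeated for every letter. This is a sketch that assumes koRpus is set up as above and that each document can be coerced to a character string with as.character().
# Flesch-Kincaid score for each of the five letters
for (j in seq_along(letters)) {
  tagged <- tokenize(as.character(letters[[j]]), format = "obj", lang = "en")
  print(readability(tagged, "Flesch.Kincaid", hyphen = NULL, force.lang = "en"))
}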
Create a term-document matrix and find the words occurring more than 150 times in the letters for 2008-2012. Do appropriate preprocessing.
# convert to lower
clean.letters <- tm_map(letters, content_transformer(tolower))
# remove punctuation
clean.letters <- tm_map(clean.letters,removePunctuation)
# remove numbers
clean.letters <- tm_map(clean.letters,removeNumbers)
# remove stop words
clean.letters <- tm_map(clean.letters,removeWords,stopwords('SMART'))
# strip extra white space
clean.letters <- tm_map(clean.letters,stripWhitespace)
# stem the document -- takes a while to run
stem.letters <- tm_map(clean.letters,stemDocument, language = "english")
# stem completion -- takes a while to run
stem.letters <- tm_map(stem.letters,stemCompletion, dictionary=clean.letters)
# create term document matrix -- one row for each term and one column for each document
tdm <- TermDocumentMatrix(clean.letters, control = list(wordLengths = c(3, Inf)))
dim(tdm)
findFreqTerms(tdm, lowfreq = 150,highfreq = Inf)
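findFreqTerms() reports only which terms pass the threshold. If the counts themselves are wanted, they can be read directly from the matrix, as in this short sketch that reuses the tdm built above.
# frequencies of the terms occurring at least 150 times
freq <- rowSums(as.matrix(tdm))
sort(freq[freq >= 150], decreasing = TRUE)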
Report the frequency of the 20 most frequent words. Do several runs to identify words that should be removed from the top 20 and remove them.
# Create a term document matrix
tdm <- TermDocumentMatrix(stem.letters)
# convert term document matrix to a regular matrix to get frequencies of words
m <- as.matrix(tdm)
# sort on frequency of terms to get frequencies of words
v <- sort(rowSums(m), decreasing=TRUE)
# display the 20 most frequent words
v[1:20]
# continue the process by removing words, as sketched below
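The removal step can be scripted by listing the uninformative words found in the first run, dropping them from the stemmed corpus, and rebuilding the matrix. The word list below is purely illustrative; substitute the words identified in your own runs.
# illustrative word list -- replace with the words you decide to drop
drop.words <- c("berkshire", "year")
pruned.letters <- tm_map(stem.letters, removeWords, drop.words)
tdm <- TermDocumentMatrix(pruned.letters)
v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
v[1:20]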
Produce a word cloud for the words identified in the prior exercise.
library(wordcloud)
# put the word frequencies into a data frame for the cloud
d <- data.frame(word = names(v), freq = v)
# select the color palette
pal <- brewer.pal(5, "Accent")
# generate the cloud based on the 30 most frequent words
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)
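Because wordcloud() places words with some randomness, fixing the seed makes the layout reproducible across runs; this is a minor addition to the answer above.
# fix the random layout so repeated runs give the same cloud
set.seed(42)
wordcloud(d$word, d$freq, min.freq = d$freq[30], colors = pal)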
Select a word and compute its association with other words in the Buffett letters corpus. Adjust the correlation coefficient to get about 10 words.
# compute the associations
findAssocs(tdm, "insurance",0.90)
Review the documentation of the hclust function in the stats package and try one or two other clustering techniques.
require(ggplot2)
require(ggdendro)
# setup the document term matrix
tdm <- TermDocumentMatrix(clean.letters)
# name the columns for the letter's year
colnames(tdm) <- 2008:2012
# Remove sparse terms
tdm1 <- removeSparseTerms(tdm, 0.5)
# transpose the matrix
tdmtranspose <- t(tdm1)
cluster <- hclust(dist(tdmtranspose), method = 'centroid')
# get the clustering data
dend <- as.dendrogram(cluster)
# plot the tree
ggdendrogram(dend, rotate = TRUE)
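The exercise asks for one or two alternatives to the centroid method. A sketch that swaps in Ward's method and complete linkage, both available directly through hclust, and plots each dendrogram the same way.
# Ward's method on the same distance matrix
cluster.ward <- hclust(dist(tdmtranspose), method = 'ward.D2')
ggdendrogram(as.dendrogram(cluster.ward), rotate = TRUE)
# complete linkage (hclust's default) for comparison
cluster.complete <- hclust(dist(tdmtranspose), method = 'complete')
ggdendrogram(as.dendrogram(cluster.complete), rotate = TRUE)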