# Read the 2012 UPS annual report as one large string and score its
# readability (Flesch-Kincaid) with koRpus.
library(stringr)  # provides str_split() / str_c(); was never loaded
library(koRpus)   # was fused onto the length() line below in the original
url <- "http://dl.dropboxusercontent.com/u/53671029/UGAMIT/UPS_Annual_Reports/2012.txt"
# Fixed: the original readChar() call was missing its closing parenthesis.
f <- readChar(url, nchars = 1e6)
# Split the report into words on single spaces.
y <- str_split(f, " ")
# Report the length of the word vector.
length(y[[1]])
# Tokenize the raw text for koRpus.
check.text <- tokenize(f, format = "obj", lang = "en")
# Compute the Flesch-Kincaid readability score.
readability(check.text, "Flesch.Kincaid", hyphen = NULL, force.lang = "en")
# Read the five UPS annual reports (2008-2012) into a data frame, one
# report per row. Replaces the original while loop that grew the data
# frame one row at a time.
years <- 2008:2012
# Build each report's URL; str_c is vectorized over `years`.
urls <- str_c(
  "http://dl.dropboxusercontent.com/u/53671029/UGAMIT/UPS_Annual_Reports/",
  years, ".txt"
)
# Read each annual report as one large string.
reports_text <- vapply(urls, function(u) readChar(u, nchars = 1e6),
                       character(1), USE.NAMES = FALSE)
# Keep the original structure: a single character column named `num`
# (the original numeric column was coerced to character on assignment).
df <- data.frame(num = reports_text, stringsAsFactors = FALSE)
# Build the tm corpus: one document per data-frame row.
# NOTE(review): newer versions of tm expect DataframeSource input with
# `doc_id` and `text` columns and no `encoding` argument; this call
# matches the older API -- confirm against the installed tm version.
library(tm)
reports <- Corpus(DataframeSource(as.data.frame(df), encoding = "UTF-8"))
# Load stemming / tokenization support. library() errors immediately on a
# missing package, unlike require(), which silently returns FALSE.
# library(Snowball)  # defunct on CRAN; superseded by SnowballC below
library(SnowballC)
library(RWeka)
library(rJava)
library(RWekajars)
# --- Clean the corpus and build the term-document matrix ----------------
# Base-R functions must be wrapped in content_transformer() so tm keeps
# the document metadata intact; a bare tolower errors under tm >= 0.6
# ("inherits(doc, 'TextDocument') is not TRUE").
clean.reports <- tm_map(reports, content_transformer(tolower))
# Remove punctuation.
clean.reports <- tm_map(clean.reports, removePunctuation)
# Remove all numbers.
clean.reports <- tm_map(clean.reports, removeNumbers)
# Collapse runs of whitespace.
clean.reports <- tm_map(clean.reports, stripWhitespace)
# Drop the SMART stop-word list.
clean.reports <- tm_map(clean.reports, removeWords, stopwords("SMART"))
# Remove domain-specific common words that would dominate the counts.
dictionary <- c("UPS", "united", "parcel", "million", "billion", "dollar")
clean.reports <- tm_map(clean.reports, removeWords, dictionary)
# Stem words to their roots.
stem.reports <- tm_map(clean.reports, stemDocument, language = "english")
# Term-document matrix; ignore words shorter than 3 characters.
# NOTE(review): this uses clean.reports, not stem.reports, yet the next
# line runs stemCompletion on its rownames -- stem.reports may have been
# intended. Behavior kept as-is; confirm with the author.
tdm <- TermDocumentMatrix(clean.reports, control = list(minWordLength = 3))
# Complete stems back to the most prevalent full word in the corpus.
tdm.stem <- stemCompletion(rownames(tdm), dictionary = clean.reports,
                           type = "prevalent")
rownames(tdm) <- as.vector(tdm.stem)
# Terms appearing at least 500 times across all five reports.
findFreqTerms(tdm, lowfreq = 500, highfreq = Inf)
# Word-cloud of the most frequent terms. The original had this whole
# pipeline fused onto a single line, with comments swallowing the code;
# restored to one statement per line.
# Convert the term-document matrix to a regular matrix to get word counts.
m <- as.matrix(tdm)
# Total frequency of each term across all reports, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
# Term names corresponding to the sorted frequencies (renamed from
# `names`, which shadowed base::names).
term_names <- names(v)
# Data frame for plotting.
d <- data.frame(word = term_names, freq = v)
library(wordcloud)
# Select the color palette (use <- for assignment, not =).
pal <- brewer.pal(5, "BuGn")
# Generate the cloud from the 25 most frequent words.
wordcloud(d$word, d$freq, min.freq = d$freq[25], colors = pal)
# Hierarchical clustering of the five reports by term profile, plotted as
# a dendrogram. The original had this fused onto a single line; restored
# to one statement per line, library() instead of require(), <- for
# assignment, and TRUE instead of T.
library(ggplot2)
library(ggdendro)
# Name the columns for each report's year.
colnames(tdm) <- 2008:2012
# Drop terms absent from more than half the documents.
tdm1 <- removeSparseTerms(tdm, 0.5)
# Transpose so documents are rows for clustering.
tdmtranspose <- t(tdm1)
# Centroid-linkage hierarchical clustering on document distances.
cluster <- hclust(dist(tdmtranspose), method = "centroid")
# Convert to dendrogram form for plotting.
dend <- as.dendrogram(cluster)
# Plot the tree.
ggdendrogram(dend, rotate = TRUE)
# This page is part of the promotional and support material for
# Data Management (open edition) by Richard T. Watson.
# For questions and comments, please contact the author.