# Read the 2012 UPS annual report as one large string, split it into
# words, and report how many words it contains.
# Fix: the original readChar() call was missing its closing parenthesis,
# and stringr (for str_split) was never loaded.
library(stringr)

f <- readChar(
  "http://dl.dropboxusercontent.com/u/53671029/UGAMIT/UPS_Annual_Reports/2012.txt",
  nchars = 1e6
)

# Split the report on spaces; str_split returns a list, so take the
# first element to get the word vector.
y <- str_split(f, " ")

# Report the length of the vector (word count).
length(y[[1]])
# Score the readability of the 2012 report with the koRpus package.
# library() (not require()) so a missing package fails loudly.
library(koRpus)

# Tokenize the raw report string f into a koRpus text object.
check.text <- tokenize(f, format = "obj", lang = "en")

# Compute the Flesch-Kincaid grade-level score for the report.
readability(check.text, "Flesch.Kincaid", hyphen = NULL, force.lang = "en")
# Read the five UPS annual reports (2008-2012) and build a tm corpus.
# str_c() is vectorized over `years`, so the original while-loop that
# grew the data frame one row at a time is replaced by a single
# vectorized read; vapply() guarantees one character string per year.
years <- 2008:2012

# Build each report's URL (str_c's default sep is "").
urls <- str_c(
  "http://dl.dropboxusercontent.com/u/53671029/UGAMIT/UPS_Annual_Reports/",
  years, ".txt"
)

# Read each annual report as one large string.
texts <- vapply(urls, readChar, character(1), nchars = 1e6)

# One row per report, same single-column layout the loop produced.
df <- data.frame(num = texts, stringsAsFactors = FALSE)

# Create the corpus from the data frame (tm package).
reports <- Corpus(DataframeSource(df, encoding = "UTF-8"))
# Clean and stem the corpus: lower-case, strip punctuation, numbers,
# extra whitespace, stop words, and domain-specific filler words.
library(Snowball)
library(SnowballC)
library(RWeka)
library(rJava)
library(RWekajars)

# Convert all letters to lower case. tolower is a plain base function,
# so it must be wrapped in content_transformer() to keep the result a
# valid tm document (a bare tolower breaks under tm >= 0.6).
clean.reports <- tm_map(reports, content_transformer(tolower))
# Remove punctuation.
clean.reports <- tm_map(clean.reports, removePunctuation)
# Remove all numbers.
clean.reports <- tm_map(clean.reports, removeNumbers)
# Collapse runs of whitespace.
clean.reports <- tm_map(clean.reports, stripWhitespace)
# Drop common English stop words (SMART list).
clean.reports <- tm_map(clean.reports, removeWords, stopwords("SMART"))
# Drop words common to every UPS report that carry no signal here.
dictionary <- c("UPS", "united", "parcel", "million", "billion", "dollar")
clean.reports <- tm_map(clean.reports, removeWords, dictionary)
# Stem the cleaned words to their roots.
stem.reports <- tm_map(clean.reports, stemDocument, language = "english")
# Build a term-document matrix from the STEMMED corpus.
# Fix: the original built the matrix from clean.reports, leaving the
# stem.reports corpus created above unused — so stemCompletion was
# "completing" terms that had never been stemmed.
tdm <- TermDocumentMatrix(stem.reports, control = list(minWordLength = 3))

# Map each stem back to its most frequent full form in the cleaned
# (unstemmed) corpus, and relabel the matrix rows with those forms.
tdm.stem <- stemCompletion(
  rownames(tdm),
  dictionary = clean.reports,
  type = "prevalent"
)
rownames(tdm) <- as.vector(tdm.stem)

# List terms occurring at least 500 times across the five reports.
findFreqTerms(tdm, lowfreq = 500, highfreq = Inf)
# Plot a word cloud of the 25 most frequent terms.
# Convert the term-document matrix to a regular matrix so row sums
# give raw term frequencies.
m <- as.matrix(tdm)
# Total frequency of each term, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
# Data frame for plotting. Fix: the original assigned names(v) to a
# local variable called `names`, shadowing base::names.
d <- data.frame(word = names(v), freq = v)

library(wordcloud)
# Select the color palette (<- for assignment, not =).
pal <- brewer.pal(5, "BuGn")
# Use the 25th-largest frequency as the floor, so the cloud shows the
# 25 most frequent words.
wordcloud(d$word, d$freq, min.freq = d$freq[25], colors = pal)
# Cluster the five reports by term usage and plot a dendrogram.
library(ggplot2)
library(ggdendro)

# Label each column of the term-document matrix with its report year.
colnames(tdm) <- 2008:2012
# Drop terms absent from at least half the documents.
tdm1 <- removeSparseTerms(tdm, 0.5)
# Transpose so the documents (years) are the rows being clustered.
tdmtranspose <- t(tdm1)
# Centroid-linkage hierarchical clustering on Euclidean distances
# (<- for assignment, not =).
cluster <- hclust(dist(tdmtranspose), method = "centroid")
# Get the clustering data.
dend <- as.dendrogram(cluster)
# Plot the tree, rotated for readable year labels (TRUE, never T).
ggdendrogram(dend, rotate = TRUE)
This page is part of the promotional and support material for *Data Management* (open edition) by Richard T. Watson. For questions and comments, please contact the author.