gReetings!
Please excuse the rather long post and the sometimes tedious comments in my script. I have included my complete script for reference.
I'm analyzing word frequencies in multiple text files. My question has to do with the last two sections in which I calculate range (the number of texts in which a particular word type occurs) and Gries' DP. I'm using the simple, non-normalized version of DP, since I'm analyzing hundreds or thousands of files and each is exactly the same length (10,000 tokens).
Running this script on 705 files (7.05 million tokens) in a 64-bit version of R (2.15.1) on a computer with 8 GB of RAM and an Intel i5 3.40 GHz CPU took approximately 5 hours. When I increased that to a little more than 18 million tokens (1,842 files) it took around three days to finish. My hope is to also run the script on a collection of nearly 50 million. I fear that will take a month to finish.
Many thanks for any suggestions!
Joseph
#MultiFile_frequency_range_DP.r
#
library(gtools) #The mixedsort function that is used repeatedly in this script is from the Warnes’ (2012) gtools library.
selected.files <- list.files(path=getwd(), pattern="*.txt") #Selects all the files from the working directory.
selected.files <- mixedsort(selected.files) #Reorders the files in their proper numeric order.
combined.list = NULL #Initiates the vector to receive the combined list.
length(combined.list) <- 10000000 #preallocates memory space to the vector so that a new space does not need to be found as it is enlarged with each iteration of the loop below.
for (i in 1:length(selected.files)) {
text.file<-scan(selected.files[[i]], what="char", sep="\n", quote="", comment.char="") #Inputs the text file.
text.file<-tolower(text.file) #Changes all alphabetic characters to lower case.
text.file<-gsub("<.*?>", "", text.file, perl=T) #Removes tags.
word.list<-strsplit(text.file, "\\W+") #Extracts words from the file.
word.vector<-unlist(word.list) #Changes output from strsplit back into a vector.
word.vector<-word.vector[nchar(word.vector)>0] #Removes any remaining empty strings.
freq.list<-table(word.vector) #Creates a table of named integers (word types and their frequencies).
assign(paste("freq.list",i,sep="."),freq.list) #Creates a unique file name for each iteration of this loop.
combined.list<-c(combined.list, freq.list) #Adds each frequency list to combined list.
}
combined.freq <- as.table(tapply(combined.list, names(combined.list), sum)) #Transforms the series of individual frequency lists into a unified frequency list.
combined.sorted.freq.list<-sort(combined.freq, decreasing=T) #Sorts the frequency list in descending order of frequency.
all.freq.lists <- ls(pattern="freq.list.\\d{1,4}") #Lists all frequency files in memory with 1-4 digits. Actually, it will list all files with more than that many digits, too.
all.freq.lists <- mixedsort(all.freq.lists) #Reorders the lists numerically.
for (j in 1:length(all.freq.lists)) { #Loop sorts each individual frequency list into the combined order.
freq.list.j<-get(all.freq.lists[[j]])
j.in.combined<-freq.list.j[freq.list.j=names(combined.sorted.freq.list)] #Pastes the names of the frequency vector alongside the frequency integers in a matrix.
j.in.combined[
is.na(j.in.combined)] <- 0 #Changes "NA" to numeric 0.
assign(paste("combined", j, sep="."), j.in.combined) #Writes the current frequency list to file.
}
all.j.in.combined<-ls(pattern="combined.\\d{1,4}") #Lists all combined files in memory with 1-4 digits.
all.j.in.combined <- mixedsort(all.j.in.combined)
combined.file.count<-1:length(selected.files) #Counts the number of files originally selected.
combined.file.list<-paste("combined", combined.file.count, sep=".") #Creates the file names for the combined lists by catenating "combined" with each file number separated by a period.
combined.table<-paste(names(combined.sorted.freq.list), combined.sorted.freq.list, sep="\t") #Creates a table with columns for the words of the combined vocabulary and the total frequency.
for (x in 1:length(combined.file.list)) {
add.to.table<-get(combined.file.list[[x]])
combined.table<-paste(combined.table, add.to.table, sep="\t") #Adds each of the component lists to the combined table.
}
header<-paste(selected.files, sep="\t") #Creates header labels for the component frequency lists.
table.header <- c("Word_type", "Total_Frequency", header, "\n") #Adds column labels for the Word_type and total columns. The hard return "\n" at the end will force the first row of the frequency table onto the second line of the file.
cat(table.header, file="word_frequencies.csv", sep="\t") #Saves the header row to a spreadsheet file.
cat(combined.table, file="word_frequencies.csv", sep="\n", append=TRUE) #Adds the table to the spreadsheet file.
rm(list=ls(pattern="freq.list.\\d*|combined.\\d*")) #Removes temporary files used to create the frequency table.
#
#vocabulary range
#
vocab.table <- read.csv("word_frequencies.csv", header=TRUE, sep="\t")
vocab.range = NULL #Initiates the vector to receive the range scores. Range, here, means the number of files in which a word type occurs.
length(vocab.range) <- 900000 #Preallocates memory space to the vector so that a new space does not need to be found as it is enlarged with each iteration of the loop below.
v <- length(names(combined.sorted.freq.list)) #Calculates the number of word types in the combined list
f <- length(combined.file.count)
x <- length(combined.file.count)+2 #Calculates the column number of the final file, i.e. the number of files plus the word_type and total columns.
for (i in 1:v) {
range.count <- sum(vocab.table[i,3:x] >0) #Counts number of files in which each type occurs, i.e. frequency is >0.
vocab.range <- paste(vocab.range, range.count, sep="\n")
}
cat("Range", file="vocab_range.csv")
cat(vocab.range, file="vocab_range.csv", sep="\n", append=TRUE)
#
range.table <-unlist(read.csv("vocab_range.csv", header=TRUE, sep="\t"))
range_freq.table <- paste(names(combined.sorted.freq.list), combined.sorted.freq.list, range.table, sep="\t")
range.freq.table.header <- c("Word_type", "Total_Frequency", "Range", "\n") #Adds column labels for the Word_type and total columns. The hard return "\n" at the end will force the first row of the frequency table onto the second line of the file.
cat(range.freq.table.header, file="word_freq_range.csv", sep="\t")
cat(range_freq.table, file="word_freq_range.csv", sep="\n", append=TRUE)
#
#DP (Gries, 2008, 2010)
#Gries, Stefan Th. 2008. Dispersions and adjusted frequencies in corpora. /International Journal of Corpus Linguistics/ 13(4). 403-437.
#Gries, Stefan Th. 2010. Dispersions and adjusted frequencies in corpora: further explorations. In Stefan Th. Gries, Stefanie Wulff, & Mark Davies (eds.), /Corpus linguistic applications: current studies, new directions/, 197-212. Amsterdam: Rodopi.
#
vocab.DP = NULL #Initiates the vector to receive the range scores.
length(vocab.DP) <- 1000000 #Preallocates memory space to the vector so that a new space does not need to be found as it is enlarged with each iteration of the loop below.
for (j in 1:v) {
DP.score <- sum(abs((1/f)-vocab.table[j,3:x]/vocab.table[j,2]))/2
vocab.DP <- paste(vocab.DP, DP.score, sep="\n")
}
cat("DP", file="vocab_DP.csv")
cat(vocab.DP, file="vocab_DP.csv", sep="\n", append=TRUE)
#
DP.table <-unlist(read.csv("vocab_DP.csv", header=TRUE, sep="\t"))
range_freq.DP.table <- paste(names(combined.sorted.freq.list), combined.sorted.freq.list, range.table, DP.table, sep="\t") #Creates a file with the combined list of word types present in the scanned files, each word's total frequency, its range and DP score.
range.freq.DP.table.header <- c("Word_type", "Total_Frequency", "Range", "DP", "\n") #Adds column labels for the Word_type and total columns. The hard return "\n" at the end will force the first row of the frequency table onto the second line of the file.
cat(range.freq.DP.table.header, file="word_freq_range_DP.csv", sep="\t")
cat(range_freq.DP.table, file="word_freq_range_DP.csv", sep="\n", append=TRUE)
rm(list=ls(all=T)) #Clears the memory so files do not interfere with subsequent processes.
#
#End of script