library(gsubfn)
# Choose the directory from which the corpus files are to be loaded.
# choose.dir() returns NA when the dialog is cancelled, so stop with a clear
# message instead of handing NA on to setwd().
corpus.dir <- choose.dir() # "F:/Corpora/English/2554/2554/download/Texts"
if (is.na(corpus.dir)) {
  stop("No corpus directory was selected.", call. = FALSE)
}
setwd(corpus.dir)

# Collect every .xml corpus file below the chosen directory, descending into
# all sub-directories.
corpus.files <- list.files(
  getwd(),
  recursive = TRUE,
  full.names = TRUE,
  pattern = "\\.xml$"
) # 4049 elements
head(corpus.files); tail(corpus.files)

# Reserve one list slot per corpus file. Filling preallocated slots and
# flattening once at the end avoids recopying the ever-growing result vector
# on every iteration, which is what append() inside the loop would do.
sentences.by.file <- vector("list", length(corpus.files))

for (file.index in seq_along(corpus.files)) {
  # Read the file line by line and change all corpus lines into lower case.
  current.corpus.file <- tolower(
    scan(corpus.files[file.index], what = character(), sep = "\n", quiet = TRUE)
  )
  # cat(basename(corpus.files[file.index]), "\n") # output a 'progress report'

  # Keep only the sentence lines, so that the header, utterance tags, etc.
  # are not included in our counts.
  current.sentences <- grep("<s n=", current.corpus.file, perl = TRUE, value = TRUE)

  # Strip the first sentence tag so the sentence number is not counted either.
  current.sentences <- sub("<s n=.*?>", "", current.sentences, perl = TRUE)

  sentences.by.file[[file.index]] <- current.sentences
}

# The whole corpus: all sentence lines of all files, in file order.
# as.character() guarantees a character vector even when no file was found.
whole.corpus <- as.character(unlist(sentences.by.file, use.names = FALSE))