# Clear the workspace; spell out TRUE (T can be reassigned) and the full
# argument name all.names (avoid partial argument matching)
rm(list = ls(all.names = TRUE))
# set the working directory so that you can access the corpus files you want to process
setwd(choose.dir())  # NOTE: choose.dir() is Windows-only and interactive
getwd() # "F:/My files/Corpora/English/clmet3_0/plain_text/1850-1920"
# write a function that can create n-grams
# Build all n-grams of gram.length consecutive words from input.vector.
#
# Args:
#   input.vector: character vector of words (one word per element)
#   gram.length:  number of consecutive words per n-gram (e.g. 3 for trigrams)
# Returns:
#   character vector of space-joined n-grams, in order of their starting
#   position; character(0) when the input has fewer words than gram.length.
#
# The original mapply/apply version failed on two edge cases: when the input
# yields exactly one n-gram, mapply() simplifies its result to a plain vector
# and apply(..., 2, ...) errors on the dimensionless object; when the input is
# shorter than gram.length, 1:(negative) silently produces a reversed sequence.
word.ngrams <- function (input.vector, gram.length) {
  n.words <- length(input.vector)
  if (n.words < gram.length) {
    return(character(0))  # too short for even one n-gram
  }
  # starting positions of the n-grams: 1, 2, ..., n.words - gram.length + 1
  starts <- seq_len(n.words - gram.length + 1)
  # vapply guarantees a character vector regardless of how many n-grams exist
  vapply(starts,
         function (start) {
           # paste the gram.length words beginning at this position,
           # separated by single spaces
           paste(input.vector[start:(start + gram.length - 1)],
                 collapse = " ")
         },
         character(1))
}
# select the corpus files
# Collect every .txt file in the current working directory, as full paths;
# dir() is an exact alias of list.files()
corpus.files <- dir(getwd(), pattern = "\\.txt$", full.names = TRUE)
head(corpus.files)    # inspect the first six file paths
length(corpus.files)  # how many corpus files were found
# generate a table of trigrams in a for loop ...
# Pick the output file ONCE, before the loop, so the user is not prompted on
# every iteration and all results append to the same file
output.file <- choose.files()
# seq_along() visits EVERY file; the original `for (i in length(corpus.files))`
# iterated exactly once, with i set to the number of files, so only the last
# corpus file was ever processed
for (i in seq_along(corpus.files)) {
   # load the current corpus file
   current.corpus.file <- tolower(scan(corpus.files[i], # load the ith file into current.corpus.file
      what = character(), # as a character vector
      sep = "\n", # with linebreaks as separators between vector elements
      quote = "", comment.char = "",
      quiet = TRUE)) # convert the file to lower case
   clean.current.corpus.file <- gsub(".*?<.+?>", "", current.corpus.file, ignore.case = TRUE) # remove the metadata from the file
   # create a vector of all words in the corpus
   textfile.words <- strsplit(clean.current.corpus.file, # split up the vector clean.current.corpus.file
      "[^a-z]+", # at 1+ occurrences of characters that are not letters a-z
      perl = TRUE) # using Perl-compatible regular expressions
   textfile.words <- unlist(textfile.words) # change the list into a vector
   # remove empty character strings
   textfile.words <- textfile.words[nzchar(textfile.words)]
   trigrams <- word.ngrams(textfile.words, 3)
   current.trigrams <- paste(basename(corpus.files[i]), trigrams, sep = "\t") # prefix the name of the corpus file to each trigram
   # append this file's trigrams; col.names = FALSE prevents write.table from
   # emitting a header row (and a warning) on every append
   write.table(current.trigrams, file = output.file, quote = FALSE, sep = "\t",
      row.names = FALSE, col.names = FALSE, append = TRUE)
}
.
├── create_trigrams.R
└── data
├── derived
│ └── trigrams.csv
└── original
├── CLMET3_0_3_188.txt.txt
├── CLMET3_0_3_260.txt.txt
└── CLMET3_0_3_333.txt.txt
# SETUP -------------------------------------------------------------------
library(tidyverse) # for data manipulation and piping (%>%)
library(tidytext) # for tokenization
library(readtext) # for reading text and meta-data
# Read data ---------------------------------------------------------------
# Read every file under data/original/ (text plus file-name meta-data in
# doc_id) with the stated encoding, then coerce the result into a tibble
# for nicer printing and stricter subsetting
text <- as_tibble(readtext(file = "data/original/*", encoding = "ISO-8859-1"))
# Clean data --------------------------------------------------------------
clean_text <-
  text %>%                                          # start from the raw texts
  unnest_tokens(lines, text, token = "lines") %>%   # one row per text line
  filter(!str_detect(lines, "^<")) %>%              # drop meta-data markup lines
  group_by(doc_id) %>%                              # one group per document
  summarise(                                        # re-join the lines and trim
    text = str_trim(str_c(lines, collapse = " "))   # leading/trailing whitespace
  )                                                 # in a single step
# Tokenize ----------------------------------------------------------------
# One row per trigram (three-word sequence), keyed by doc_id
trigrams <- unnest_tokens(clean_text, trigrams, text, token = "ngrams", n = 3)
# A tibble: 400,073 x 2
doc_id trigrams
<chr> <chr>
1 CLMET3_0_3_188.txt.txt the four chapters
2 CLMET3_0_3_188.txt.txt four chapters of
3 CLMET3_0_3_188.txt.txt chapters of which
4 CLMET3_0_3_188.txt.txt of which this
5 CLMET3_0_3_188.txt.txt which this work
6 CLMET3_0_3_188.txt.txt this work consists
7 CLMET3_0_3_188.txt.txt work consists originally
8 CLMET3_0_3_188.txt.txt consists originally appeared
9 CLMET3_0_3_188.txt.txt originally appeared as
10 CLMET3_0_3_188.txt.txt appeared as four
# Write data --------------------------------------------------------------
# `file =` replaces the deprecated `path =` argument of readr::write_csv()
write_csv(trigrams, file = "data/derived/trigrams.csv")
--
You received this message because you are subscribed to the Google Groups "CorpLing with R" group.
To unsubscribe from this group and stop receiving emails from it, send an email to corpling-with-r+unsubscribe@googlegroups.com.
To post to this group, send email to corpling-with-r@googlegroups.com.
Visit this group at https://groups.google.com/group/corpling-with-r.
For more options, visit https://groups.google.com/d/optout.