Earl Brown
unread,Apr 25, 2013, 5:49:38 PM4/25/13Sign in to reply to author
Sign in to forward
You do not have permission to delete messages in this group
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to corplin...@googlegroups.com
Hello Corpus Linguistics Rists,
I recently modified Stefan's (terrific) function exact.matches to add some additional functionality. I've added three arguments: L1, R1, and sur.wds. L1 = TRUE puts the contextualized matches returned in the fourth element of the returned list in order by one collocate to the left. R1 = TRUE does the same for one collocate to the right. When both are TRUE, it first orders them by L1 and then by R1.
When sur.wds = TRUE, L1 and R1 are put into separate columns to the right of the contextualized matches.
You need to have the packages "gsubfn", "data.table", and "gdata" installed from CRAN before running my modified version of exact.matches, which I renamed exactMatches to differentiate it. Here's the whole function:
# exactMatches: search a corpus (a character vector) for a regular expression
# and return every hit together with its context (a concordance).
#
# Arguments:
#   search.expression  regular expression to search for.
#   corpus.vector      character vector holding the corpus, one part per element.
#   pcre               passed as 'perl' to grep()/gregexpr().
#   case.sens          FALSE makes the search case-insensitive.
#   characters.around  if non-zero, the corpus is collapsed into one string and
#                      the context is this many characters on each side.
#   lines.around       if non-zero, this many neighbouring corpus elements are
#                      pasted before and after each hit's own line.
#                      At most one of characters.around / lines.around may be
#                      non-zero.
#   clean.up.spaces    TRUE removes spaces adjacent to the tab delimiters.
#   L1, R1             TRUE orders the hits by the word immediately to the left
#                      (L1) and/or right (R1) of the match.
#   sur.wds            TRUE appends the left and right neighbouring words as two
#                      extra tab-delimited columns.
#
# L1, R1 and sur.wds need the packages gsubfn, gdata and (for sorting)
# data.table; they are attached only when one of these options is requested.
#
# Value: invisible NULL when nothing matches; otherwise a named list with
#   "Exact matches", "Locations of matches (characters|lines)",
#   "Proportion of non-empty corpus parts with matches",
#   "Lines with delimited matches" (preceding context, match and subsequent
#   context separated by tabs), "Search parameters" and "Version (date)".
exactMatches <- function (search.expression, corpus.vector, pcre = TRUE, case.sens = TRUE, characters.around = 0, lines.around = 0, clean.up.spaces = TRUE, L1 = FALSE, R1 = FALSE, sur.wds = FALSE) {
  # Thanks to Earl Brown for feedback on an earlier version
  if (characters.around != 0 && lines.around != 0) {
    stop("At least one of 'characters.around' and 'lines.around' has to be zero ...")
  }

  # the numbers of lines that contain matches
  line.numbers.with.matches <- grep(search.expression, corpus.vector, perl = pcre, value = FALSE, ignore.case = !case.sens)
  if (length(line.numbers.with.matches) == 0) {
    # no matches: same (invisible NULL) result as before, stated explicitly
    return(invisible(NULL))
  }

  if (characters.around != 0) {
    # character contexts may cross line boundaries, so search the whole corpus
    # as one string; the odd token is a join marker that is turned into a space
    lines.with.matches <- gsub("( ?_qW1aS3zX5eR7dF9cV_|_qW1aS3zX5eR7dF9cV_ ?)",
                               " ",
                               paste(corpus.vector, collapse = "_qW1aS3zX5eR7dF9cV_"),
                               perl = TRUE)
  } else {
    lines.with.matches <- corpus.vector[line.numbers.with.matches] # the lines that contain matches
  }

  matches <- gregexpr(search.expression, lines.with.matches, perl = pcre, ignore.case = !case.sens) # the start positions and lengths of matches
  number.of.matches <- vapply(matches, length, integer(1)) # the number of matches per line (of the lines that have matches)
  lines <- rep(lines.with.matches, number.of.matches) # the lines with matches, each as many times as it has matches
  line.numbers.with.matches <- rep(line.numbers.with.matches, number.of.matches)
  starts <- unlist(matches) # starting positions of matches
  stops <- starts + unlist(lapply(matches, attr, "match.length")) - 1 # end positions of matches
  exact.string.matches <- substr(lines, starts, stops) # the exact matches

  preceding.from <- if (characters.around != 0) starts - characters.around else 1
  subsequent.to <- if (characters.around != 0) stops + characters.around else nchar(lines)
  lines.with.delimited.matches <- paste0( # the lines with the tab-delimited matches
    substr(lines, preceding.from, starts - 1), "\t", # preceding contexts
    exact.string.matches, "\t", # matches
    substr(lines, stops + 1, subsequent.to)) # subsequent contexts

  ####################
  # Earl's attempt to order the contextualized results by collocates
  # and to put the surrounding words in their own columns
  if (L1 || R1 || sur.wds) {
    # attach the helper packages only when their functionality is requested,
    # so that the plain concordance works without them
    library("gsubfn")
    if (L1 || R1) {
      library("data.table")
    }
    library("gdata")

    # the output is tab-delimited and is split on tabs below, so tabs in the
    # input would shift the columns
    if (any(grepl("\t", lines.with.matches, fixed = TRUE))) {
      stop("You have some rogue tabs in your input. You need to delete them before proceeding as they will jack up the output.")
    }

    # word directly before the match (last word of the preceding context)
    previous.words <- function(delimited) {
      sapply(strsplit(delimited, "\t"), function(x) strapplyc(trim(x[1]), "(\\w+).?$", backref = 1))
    }
    # word directly after the match (first word of the subsequent context)
    following.words <- function(delimited) {
      sapply(strsplit(delimited, "\t"), function(x) strapplyc(trim(x[3]), "^[ \\.,?!:;\"\']*(\\w+)", backref = 1))
    }
    # permutation that sorts the hits by the given collocates
    collocate.order <- function(words) {
      dd <- data.table(x = seq_along(words), y = format(words))
      dd[order(y)]$x
    }

    # R1 is sorted first and L1 last, so that with both options L1 is the
    # primary key (this assumes the sort leaves ties in their existing order)
    if (R1) {
      new.order <- collocate.order(following.words(lines.with.delimited.matches))
      lines.with.delimited.matches <- lines.with.delimited.matches[new.order]
      exact.string.matches <- exact.string.matches[new.order] # keep in step with the reordered lines
      line.numbers.with.matches <- line.numbers.with.matches[new.order]
      starts <- starts[new.order]
    }
    if (L1) {
      new.order <- collocate.order(previous.words(lines.with.delimited.matches))
      lines.with.delimited.matches <- lines.with.delimited.matches[new.order]
      exact.string.matches <- exact.string.matches[new.order] # keep in step with the reordered lines
      line.numbers.with.matches <- line.numbers.with.matches[new.order]
      starts <- starts[new.order]
    }

    # puts surrounding words in their own columns
    if (sur.wds) {
      # fixed = TRUE: "character(0)" must be matched literally; as a regular
      # expression the parentheses would form a group and never match
      prev.wd <- sub("character(0)", "NA", format(previous.words(lines.with.delimited.matches)), fixed = TRUE)
      fol.wd <- sub("character(0)", "NA", format(following.words(lines.with.delimited.matches)), fixed = TRUE)
      lines.with.delimited.matches <- paste(lines.with.delimited.matches, prev.wd, fol.wd, sep = "\t")
    }
  }
  # end Earl's attempt
  ####################

  if (lines.around != 0) {
    # pad the corpus so that the line after a match always exists
    corpus.vector <- append(corpus.vector, rep("", lines.around))
    # index 0 selects nothing, which yields an empty preceding context for
    # matches in the first line
    starts.of.previous.lines <- pmax(0, line.numbers.with.matches - lines.around)
    ends.of.subsequent.lines <- pmin(line.numbers.with.matches + lines.around, length(corpus.vector))
    for (i in seq_along(lines.with.delimited.matches)) {
      current.line <- line.numbers.with.matches[i]
      lines.with.delimited.matches[i] <- paste(
        paste(corpus.vector[starts.of.previous.lines[i]:(current.line - 1)], collapse = " "),
        lines.with.delimited.matches[i],
        paste(corpus.vector[(current.line + 1):ends.of.subsequent.lines[i]], collapse = " "),
        sep = " ")
    }
  }

  # cleaning output as necessary/requested by user
  if (clean.up.spaces) { # clean up spaces around tabs
    lines.with.delimited.matches <- gsub(" *\t *", "\t", lines.with.delimited.matches, perl = TRUE)
  }
  lines.with.delimited.matches <- gsub("(^ {1,}| {1,}$)", "", lines.with.delimited.matches, perl = TRUE) # clean up leading and trailing spaces

  output.list <- list(exact.string.matches,
                      if (characters.around != 0) starts else line.numbers.with.matches, # starting character positions or the numbers of lines with matches, each as many times as it has matches
                      length(unique(line.numbers.with.matches)) / sum(nzchar(corpus.vector)),
                      lines.with.delimited.matches,
                      c(Pattern = search.expression, "Corpus (1st 100 char.)" = substr(paste(corpus.vector, collapse = " "), 1, 100), PCRE = pcre, "Case-sensitive" = case.sens),
                      "1.2 (17 August 2012)")
  names(output.list) <- c("Exact matches",
                          paste0("Locations of matches (", if (characters.around != 0) "characters" else "lines", ")"),
                          "Proportion of non-empty corpus parts with matches",
                          "Lines with delimited matches",
                          "Search parameters",
                          "Version (date)")
  output.list
}