library(xml2)
library(stringr)
# Parse the input document and collect every <p> element as a serialized
# markup string (one string per paragraph).
# NOTE(review): the path points at `word/document.xml`; in a .docx archive that
# file uses `w:`-prefixed WordprocessingML elements rather than HTML
# <p>/<b>/<i>/<u> tags -- confirm the input really is HTML-like.
doc <- read_html("/Users/ekb5/Downloads/word/document.xml")
# Query the parsed tree instead of regex-matching the serialized document:
# a pattern such as "<p.*?/p>" skips paragraphs that span several lines
# (`.` does not match line breaks) and also starts matching at <pre>,
# <param>, and other tags whose names merely begin with "p".
paragraphs <- as.character(xml_find_all(doc, "//p"))
#' Strip markup from paragraphs that contain bold, italic, or underlined text
#'
#' A paragraph counts as "formatted" when it contains a closing `</b>`,
#' `</i>`, or `</u>` tag.
#'
#' @param txt Character vector of paragraphs serialized as markup.
#' @return A character vector the same length as `txt`. Formatted paragraphs
#'   are returned with every tag removed; all other elements (including `NA`
#'   input) are `NA_character_`.
extract_formatted_text <- function(txt) {
  has_format <- stringr::str_detect(txt, "</[biu]>")
  # Treat NA input as "no formatting" so it can never reach an `if` condition.
  keep <- !is.na(has_format) & has_format

  # Start from all-NA *character* output so the return type does not depend
  # on whether anything matched.
  output <- rep(NA_character_, length(txt))
  # `[^>]*` (rather than `.*?`) also removes tags that contain a line break.
  output[keep] <- stringr::str_replace_all(txt[keep], "<[^>]*>", "")
  output
}
# Run each paragraph through the extractor, then discard the NA placeholders
# it returns for paragraphs without bold, italic, or underlined text.
result <- Filter(
  Negate(is.na),
  unlist(lapply(paragraphs, extract_formatted_text))
)
print(result)