This tutorial shows how to use Part-of-Speech (POS) tagging with the openNLP package.
We extract proper nouns (tag NNP for singular and tag NNPS for plural proper nouns) from paragraphs of presidents’ speeches.
options(stringsAsFactors = FALSE)
library(tm)
# Read the SOTU paragraphs (presumably "State of the Union" speeches) from a
# semicolon-separated CSV file, declared as UTF-8
textdata <- read.csv("data/sotu_paragraphs.csv", sep = ";", encoding = "UTF-8")
# Read the English stopword list line by line
english_stopwords <- readLines("resources/stopwords_en.txt", encoding = "UTF-8")
# Create corpus object: the mapping assigns the data frame columns "id", "text"
# and "date" to the document ID, content and timestamp
m <- list(ID = "id", content = "text", DateTimeStamp = "date")
# NOTE(review): readTabular() appears to be unavailable in newer tm releases;
# confirm that the installed tm version still provides it
myReader <- readTabular(mapping = m)
corpus <- Corpus(DataframeSource(textdata), readerControl = list(reader = myReader))
# Attach openNLP and its model data. library() is used instead of require()
# so that a missing package stops the script right here with a clear error
# instead of a warning followed by a confusing failure further down.
library(openNLP)
library(openNLPdata)
# openNLP annotator objects: sentence splitter, word tokenizer and POS tagger.
# They are created once at script level and reused for every document.
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
#' Annotate a document with part-of-speech tags
#'
#' Runs the script-level annotators (`sent_token_annotator`,
#' `word_token_annotator`, `pos_tag_annotator`) over the text and joins every
#' word token with its POS tag as `token_TAG`.
#'
#' @param doc Text to annotate; it is coerced with `as.String()`.
#' @param pos_filter Optional character vector of POS tags to keep, e.g.
#'   `c("NNP", "NNPS")`. `NULL` (the default) keeps every token.
#' @return A single character string of space-separated `token_TAG` items;
#'   `""` when there is no word token or no token matches `pos_filter`.
annotateDocuments <- function(doc, pos_filter = NULL) {
  doc <- as.String(doc)
  doc_with_annotations <- annotate(
    doc,
    list(sent_token_annotator, word_token_annotator, pos_tag_annotator)
  )
  # Select the word-level annotations once and reuse them for tags and tokens.
  word_annotations <- subset(doc_with_annotations, type == "word")
  word_features <- word_annotations$features
  # Without this guard paste0() would recycle the empty token/tag vectors
  # against "_" and produce a spurious "_" token.
  if (length(word_features) == 0L) {
    return("")
  }
  # vapply() guarantees a character vector with exactly one tag per word.
  tags <- vapply(word_features, `[[`, character(1), "POS")
  tokens <- doc[word_annotations]
  res <- paste0(tokens, "_", tags)
  if (!is.null(pos_filter)) {
    res <- res[tags %in% pos_filter]
  }
  paste(res, collapse = " ")
}
# Run the annotation on the first 10 documents of the corpus only
annotated_corpus <- tm_map(corpus[1:10], annotateDocuments)
# Have a look at the first two annotated documents
annotated_corpus[[1]]
annotated_corpus[[2]]
We annotate the first 1,000 paragraphs of the corpus and extract the proper nouns, which largely correspond to Named Entities (NEs) such as person names, locations etc. Then we compute the significance of their co-occurrence.
# Annotate at most the first 1000 documents and keep only proper nouns.
# seq_len(min(...)) avoids indexing past the end of a smaller corpus, and
# vapply() guarantees a character vector with one string per document.
sample_corpus <- vapply(
  corpus[seq_len(min(1000L, length(corpus)))],
  annotateDocuments,
  character(1),
  pos_filter = c("NNP", "NNPS")
)
# Binary term matrix: a cell is 1 if the term occurs in the document at all.
# library() instead of require() so a missing Matrix package fails right here.
library(Matrix)
# Terms must occur in at least this many documents to be kept
minimumFrequency <- 2
filtered_corpus <- Corpus(VectorSource(sample_corpus))
binDTM <- DocumentTermMatrix(
  filtered_corpus,
  control = list(
    bounds = list(global = c(minimumFrequency, Inf)),
    weighting = weightBin
  )
)
# Rebuild the matrix as a Matrix-package sparse matrix from its i/j/v fields
# so that the matrix product below can be computed
binDTM <- sparseMatrix(
  i = binDTM$i,
  j = binDTM$j,
  x = binDTM$v,
  dims = c(binDTM$nrow, binDTM$ncol),
  dimnames = dimnames(binDTM)
)
# Matrix multiplication for co-occurrence counts (term x term)
coocCounts <- t(binDTM) %*% binDTM
# Load the helper that provides calculateCoocStatistics()
source("calculateCoocStatistics.R")
# Term whose co-occurring terms are to be measured. The terms in binDTM are
# lowercase (presumably lowercased by DocumentTermMatrix()), hence "_nnp".
coocTerm <- "washington_nnp"
# Co-occurrence significance with coocTerm, using the "LOGLIK" measure
coocs <- calculateCoocStatistics(coocTerm, binDTM, measure = "LOGLIK")
# head() prints up to 20 entries and, unlike coocs[1:20], does not pad the
# output with NA when fewer than 20 entries exist
print(head(coocs, 20))
## allegheny_nnp baltimore_nnp cape_nnp charleston_nnp december_nnp
## 6.558371 6.558371 6.558371 6.558371 6.558371
## delaware_nnp norfolk_nnp patch_nnp pea_nnp boston_nnp
## 6.558371 6.558371 6.558371 6.558371 5.530002
## carolina_nnp fort_nnp point_nnp niagara_nnp court_nnp
## 5.530002 5.530002 5.530002 4.868551 4.381371
## district_nnp mobile_nnp north_nnp president_nnp chesapeake_nnp
## 4.381371 4.381371 4.381371 4.381371 3.680149
For German language support, run:
# install.packages("openNLPmodels.de", repos = "http://datacube.wu.ac.at")
# require("openNLPmodels.de")
Optional exercise: Modify the function annotateDocuments in such a way that consecutive tokens with the same POS tag get merged into a single token (e.g. “United_NNP States_NNP” becomes “United_States_NNP”).

2017, Andreas Niekler and Gregor Wiedemann. GPLv3. tm4ss.github.io