namefinder.Rmd
# decipher provides the tnf_train()/tnf()/get_names() helpers used below
library(decipher)
# get working directory
# need to pass full path: the model and data arguments below must be absolute
wd <- getwd()
# Train a model that recognises mentions of the "WEF" entity.
# Training data is annotated inline with <START:type> ... <END> tags.
training_text <- paste(
  "This organisation is called the <START:wef> World Economic Forum <END>",
  "It is often referred to as <START:wef> Davos <END> or the <START:wef> WEF <END> ."
)
data <- training_text
# Fit the name finder; the returned value is the full path to the model file
model <- tnf_train(
  model = paste0(wd, "/wef.bin"),
  lang = "en",
  data = data,
  type = "wef"
)
print(model)
#> Indexing events with TwoPass using cutoff of 0
#>
#> Computing event counts... done. 19 events
#> Indexing... done.
#> Collecting events... Done indexing in 0.02 s.
#> Incorporating indexed data for training...
#> done.
#> Number of Event Tokens: 19
#> Number of Outcomes: 3
#> Number of Predicates: 256
#> Computing model parameters...
#> Performing 300 iterations.
#> 1: . (13/19) 0.6842105263157895
#> 2: . (17/19) 0.8947368421052632
#> 3: . (19/19) 1.0
#> 4: . (19/19) 1.0
#> 5: . (19/19) 1.0
#> 6: . (19/19) 1.0
#> Stopping: change in training set accuracy less than 1.0E-5
#> Stats: (19/19) 1.0
#> ...done.
#>
#> Training data summary:
#> #Sentences: 1
#> #Tokens: 19
#> #wef entities: 3
#> [1] "/home/jp/news-r/decipher/vignettes/wef.bin"
# Build unannotated test sentences for the trained model
sentences <- paste(
  "This sentence mentions the World Economic Forum the annual meeting",
  "of which takes place in Davos. Note that the forum is often shortened to WEF."
)
# Annotate the test sentences with the trained model and show the result
results <- tnf(model = model, sentences = sentences)
print(results)
#> [1] "This sentence mentions the <START:wef> World Economic <END> Forum the annual meeting of which takes place in <START:wef> Davos. <END> Note that the forum is often shortened to <START:wef> WEF. <END>"
#> [1] "This sentence mentions the <START:wef> World Economic <END> Forum the annual meeting of which takes place in <START:wef> Davos. <END> Note that the forum is often shortened to <START:wef> WEF. <END>"
# Parse the annotated strings into a tibble of extracted entities
# (renamed from `names` to avoid masking base::names)
found_names <- get_names(results)
print(found_names)
#> # A tibble: 3 x 3
#> string type name
#> <chr> <chr> <chr>
#> 1 <START:wef> World Economic <END> wef World Economic
#> 2 <START:wef> Davos. <END> wef Davos
#> 3 <START:wef> WEF. <END> wef WEF
You can also train and run your model from .txt
files.
# Same workflow, but training from a .txt file on disk
# Training to find "WEF": data annotated with <START:wef> ... <END> tags
data <- paste(
  "This organisation is called the <START:wef> World Economic Forum <END>",
  "It is often referred to as <START:wef> Davos <END> or the <START:wef> WEF <END> ."
)
# Persist the annotated text so the file-based trainer can read it
write(data, file = "input.txt")
# Trains the model and returns the full path to the model.
# file.path() builds the paths instead of paste0(wd, "/...") — same result,
# idiomatic path construction.
(
  model <- tnf_train_(
    model = file.path(wd, "wef.bin"),
    lang = "en",
    data = file.path(wd, "input.txt"),
    type = "wef"
  )
)
#> Indexing events with TwoPass using cutoff of 0
#>
#> Computing event counts... done. 19 events
#> Indexing... done.
#> Collecting events... Done indexing in 0.02 s.
#> Incorporating indexed data for training...
#> done.
#> Number of Event Tokens: 19
#> Number of Outcomes: 3
#> Number of Predicates: 256
#> Computing model parameters...
#> Performing 300 iterations.
#> 1: . (13/19) 0.6842105263157895
#> 2: . (17/19) 0.8947368421052632
#> 3: . (19/19) 1.0
#> 4: . (19/19) 1.0
#> 5: . (19/19) 1.0
#> 6: . (19/19) 1.0
#> Stopping: change in training set accuracy less than 1.0E-5
#> Stats: (19/19) 1.0
#> ...done.
#>
#> Training data summary:
#> #Sentences: 1
#> #Tokens: 19
#> #wef entities: 3
#> [1] "/home/jp/news-r/decipher/vignettes/wef.bin"
# Create sentences to test our model
sentences <- paste(
  "This sentence mentions the World Economic Forum the annual meeting",
  "of which takes place in Davos. Note that the forum is often called the WEF."
)
# Save sentences.
# BUG FIX: the original wrote `data` (the training text) to sentences.txt,
# so the model was run on its own training data instead of the test sentences.
write(sentences, file = "sentences.txt")
# Extract names
# Without specifying an output file the extracted names appear in the console.
# Note: tnf_() is called for its printed output; its return value (NULL) is
# deliberately not assigned to `model`, which would have clobbered the model path.
(tnf_(model = model, sentences = file.path(wd, "sentences.txt")))
#> NOTE(review): the output below was captured from the buggy run on the
#> training text; after the fix, the annotated TEST sentences are printed.
#> This organisation is called the <START:wef> <START:wef> World Economic Forum <END> <END> It is often referred to as <START:wef> <START:wef> <END> Davos <END> or the <START:wef> <START:wef> <END> WEF <END> .
#> NULL
# You can train slightly more sophisticated models too:
# here a single "sentiment" type with .pos / .neg sub-types.
data <- paste(
  "This sentence is <START:sentiment.neg> very bad <END> !",
  "This sentence is <START:sentiment.pos> rather good <END> .",
  "This sentence on the other hand, is <START:sentiment.neg> horrible <END> ."
)
# Persist the annotated training text for the file-based trainer
write(data, file = "input.txt")
# Train and echo the full path to the fitted model
model <- tnf_train_(
  model = paste0(wd, "/sentiment.bin"),
  lang = "en",
  data = paste0(wd, "/input.txt"),
  type = "sentiment"
)
print(model)
#> Indexing events with TwoPass using cutoff of 0
#>
#> Computing event counts... done. 21 events
#> Indexing... done.
#> Collecting events... Done indexing in 0.03 s.
#> Incorporating indexed data for training...
#> done.
#> Number of Event Tokens: 21
#> Number of Outcomes: 3
#> Number of Predicates: 227
#> Computing model parameters...
#> Performing 300 iterations.
#> 1: . (15/21) 0.7142857142857143
#> 2: . (20/21) 0.9523809523809523
#> 3: . (21/21) 1.0
#> 4: . (21/21) 1.0
#> 5: . (21/21) 1.0
#> 6: . (21/21) 1.0
#> Stopping: change in training set accuracy less than 1.0E-5
#> Stats: (21/21) 1.0
#> ...done.
#>
#> Training data summary:
#> #Sentences: 1
#> #Tokens: 21
#> #sentiment.neg entities: 2
#> #sentiment.pos entities: 1
#> [1] "/home/jp/news-r/decipher/vignettes/sentiment.bin"
# Test sentences containing both a negative and a positive span
sentences <- paste(
  "The first half of this sentence is a bad and negative while",
  "the second half is great and positive."
)
# Save sentences.
# BUG FIX: the original wrote `data` (the training text) to sentences.txt,
# so the model annotated its own training data instead of the test sentences.
write(sentences, file = "sentences.txt")
# Extract names
# Without specifying an output file the extracted names appear in the console
(tnf_(model = model, sentences = paste0(wd, "/sentences.txt")))
#> NOTE(review): the output below was captured from the buggy run on the
#> training text; rerun after the fix to see the annotated test sentences.
#> This sentence is <START:sentiment.neg> <START:sentiment> very bad <END> <END> ! This sentence is <START:sentiment.pos> rather <START:sentiment> good <END> <END> . This sentence on the other hand, is <START:sentiment> <START:sentiment.neg> horrible <END> <END> .
#> NULL