library(decipher)
# Capture the current working directory once; it is used further down to
# build the path of the model file.
wd <- getwd()
# Classification
# Build a tiny labelled corpus: each row pairs a class label with the text
# of one document.
data <- data.frame(
  class = c("Sport", "Business", "Sport", "Sport"),
  doc = c(
    "Football, tennis, golf and, bowling and, score",
    "Marketing, Finance, Legal and, Administration",
    "Tennis, Ski, Golf and, gym and, match",
    "football, climbing and gym"
  )
)
# Stack 50 copies of those four rows to have enough data.
# Obviously do not do that in the real world.
data <- do.call(rbind, rep(list(data), times = 50))
# train model
# The model file is written to "model.bin" inside the working directory held
# in `wd`; file.path() joins the pieces with the path separator instead of
# pasting a "/" by hand.
# The outer parentheses make R print the value returned by dc_train() (the
# recorded output below ends with the path of the model file).
(model <- dc_train(
  model = file.path(wd, "model.bin"),
  data = data,
  lang = "en"
))
#> Indexing events with TwoPass using cutoff of 5
#>
#> Computing event counts... done. 200 events
#> Indexing... done.
#> Sorting and merging events... done. Reduced 200 events to 4.
#> Done indexing in 0.04 s.
#> Incorporating indexed data for training...
#> done.
#> Number of Event Tokens: 4
#> Number of Outcomes: 2
#> Number of Predicates: 18
#> ...done.
#> Computing model parameters ...
#> Performing 100 iterations.
#> 1: ... loglikelihood=-138.62943611198907 0.75
#> 2: ... loglikelihood=-90.05132870679027 1.0
#> 3: ... loglikelihood=-68.48688352319556 1.0
#> 4: ... loglikelihood=-55.475703607394934 1.0
#> 5: ... loglikelihood=-46.62565005433217 1.0
#> 6: ... loglikelihood=-40.19153916662768 1.0
#> 7: ... loglikelihood=-35.300372845494124 1.0
#> 8: ... loglikelihood=-31.45770259589135 1.0
#> 9: ... loglikelihood=-28.360434714031207 1.0
#> 10: ... loglikelihood=-25.811965117229352 1.0
#> 11: ... loglikelihood=-23.67910194735546 1.0
#> 12: ... loglikelihood=-21.868407310747656 1.0
#> 13: ... loglikelihood=-20.312411053920847 1.0
#> 14: ... loglikelihood=-18.961190747341263 1.0
#> 15: ... loglikelihood=-17.777026534920605 1.0
#> 16: ... loglikelihood=-16.7308955787924 1.0
#> 17: ... loglikelihood=-15.800107742996762 1.0
#> 18: ... loglikelihood=-14.966671656304591 1.0
#> 19: ... loglikelihood=-14.21614104882692 1.0
#> 20: ... loglikelihood=-13.536784529138925 1.0
#> 21: ... loglikelihood=-12.918977853059902 1.0
#> 22: ... loglikelihood=-12.354752178772456 1.0
#> 23: ... loglikelihood=-11.837453572211473 1.0
#> 24: ... loglikelihood=-11.361483099110494 1.0
#> 25: ... loglikelihood=-10.922096124141827 1.0
#> 26: ... loglikelihood=-10.515245676878507 1.0
#> 27: ... loglikelihood=-10.137459008768577 1.0
#> 28: ... loglikelihood=-9.785739425480292 1.0
#> 29: ... loglikelihood=-9.457487563266024 1.0
#> 30: ... loglikelihood=-9.150437765017292 1.0
#> 31: ... loglikelihood=-8.862606285622736 1.0
#> 32: ... loglikelihood=-8.592248840697614 1.0
#> 33: ... loglikelihood=-8.337825591850594 1.0
#> 34: ... loglikelihood=-8.0979720933992 1.0
#> 35: ... loglikelihood=-7.871475050325472 1.0
#> 36: ... loglikelihood=-7.657251983863378 1.0
#> 37: ... loglikelihood=-7.454334089827545 1.0
#> 38: ... loglikelihood=-7.261851720330281 1.0
#> 39: ... loglikelihood=-7.079022032591023 1.0
#> 40: ... loglikelihood=-6.905138436971859 1.0
#> 41: ... loglikelihood=-6.739561545993127 1.0
#> 42: ... loglikelihood=-6.581711381232067 1.0
#> 43: ... loglikelihood=-6.431060638951914 1.0
#> 44: ... loglikelihood=-6.287128850518974 1.0
#> 45: ... loglikelihood=-6.149477302028826 1.0
#> 46: ... loglikelihood=-6.017704600525814 1.0
#> 47: ... loglikelihood=-5.891442792880982 1.0
#> 48: ... loglikelihood=-5.770353958661712 1.0
#> 49: ... loglikelihood=-5.654127210859458 1.0
#> 50: ... loglikelihood=-5.542476048674285 1.0
#> 51: ... loglikelihood=-5.435136015106892 1.0
#> 52: ... loglikelihood=-5.331862619214681 1.0
#> 53: ... loglikelihood=-5.232429488815388 1.0
#> 54: ... loglikelihood=-5.136626724382094 1.0
#> 55: ... loglikelihood=-5.0442594290404354 1.0
#> 56: ... loglikelihood=-4.955146393088874 1.0
#> 57: ... loglikelihood=-4.869118914431885 1.0
#> 58: ... loglikelihood=-4.786019738832144 1.0
#> 59: ... loglikelihood=-4.705702106028015 1.0
#> 60: ... loglikelihood=-4.628028889589052 1.0
#> 61: ... loglikelihood=-4.5528718199425855 1.0
#> 62: ... loglikelihood=-4.480110781344061 1.0
#> 63: ... loglikelihood=-4.409633174714674 1.0
#> 64: ... loglikelihood=-4.341333339261739 1.0
#> 65: ... loglikelihood=-4.27511202665486 1.0
#> 66: ... loglikelihood=-4.210875922272933 1.0
#> 67: ... loglikelihood=-4.1485372086810575 1.0
#> 68: ... loglikelihood=-4.088013167057432 1.0
#> 69: ... loglikelihood=-4.029225812778168 1.0
#> 70: ... loglikelihood=-3.972101561795021 1.0
#> 71: ... loglikelihood=-3.916570924814211 1.0
#> 72: ... loglikelihood=-3.862568226612132 1.0
#> 73: ... loglikelihood=-3.810031348111636 1.0
#> 74: ... loglikelihood=-3.758901489095115 1.0
#> 75: ... loglikelihood=-3.7091229496550318 1.0
#> 76: ... loglikelihood=-3.6606429286784046 1.0
#> 77: ... loglikelihood=-3.6134113378376385 1.0
#> 78: ... loglikelihood=-3.567380629713566 1.0
#> 79: ... loglikelihood=-3.522505638814703 1.0
#> 80: ... loglikelihood=-3.4787434343780568 1.0
#> 81: ... loglikelihood=-3.436053183946128 1.0
#> 82: ... loglikelihood=-3.3943960268110023 1.0
#> 83: ... loglikelihood=-3.353734956503564 1.0
#> 84: ... loglikelihood=-3.3140347115826034 1.0
#> 85: ... loglikelihood=-3.2752616740483527 1.0
#> 86: ... loglikelihood=-3.237383774766797 1.0
#> 87: ... loglikelihood=-3.200370405347152 1.0
#> 88: ... loglikelihood=-3.1641923359645276 1.0
#> 89: ... loglikelihood=-3.1288216386655394 1.0
#> 90: ... loglikelihood=-3.094231615734874 1.0
#> 91: ... loglikelihood=-3.060396732737595 1.0
#> 92: ... loglikelihood=-3.027292555884969 1.0
#> 93: ... loglikelihood=-2.9948956934021496 1.0
#> 94: ... loglikelihood=-2.963183740602128 1.0
#> 95: ... loglikelihood=-2.9321352283960547 1.0
#> 96: ... loglikelihood=-2.9017295749916556 1.0
#> 97: ... loglikelihood=-2.871947040551748 1.0
#> 98: ... loglikelihood=-2.8427686846036746 1.0
#> 99: ... loglikelihood=-2.814176326006921 1.0
#> 100: ... loglikelihood=-2.7861525053013416 1.0
#> [1] "/home/jp/news-r/decipher/vignettes/model.bin"
# Unlabelled documents to run through the trained model, held in a single
# `docs` column.
documents <- data.frame(
  docs = c(
    "This discusses golf which is a sport.",
    "This documents is about business administration.",
    "This is about people who do sport, go to the gym and play tennis.",
    "Some play tennis and work in Finance"
  )
)
# classify documents
# Pass the trained model (the value returned by dc_train() above) and the
# data frame of new documents to dc(); presumably dc() is provided by the
# decipher package loaded at the top of the script.
classified <- dc(model, documents)
# Write the result to the console with cat().
# NOTE(review): the recorded output below shows a single "Sport" label
# followed by all four quoted documents, not one label per document --
# confirm whether dc() expects a character vector rather than a data frame.
cat(classified)
#> Sport "This discusses golf which is a sport." "This documents is about business administration." "This is about people who do sport, go to the gym and play tennis." "Some play tennis and work in Finance"