# Classify

library(decipher)

# Capture the current working directory; it is used further down to build
# the path of the model file.
wd <- getwd()

# Classification
# Build a small labelled toy corpus: `class` is the label, `doc` is the text.
data <- data.frame(
  class = c("Sport", "Business", "Sport", "Sport"),
  doc = c(
    "Football, tennis, golf and, bowling and, score",
    "Marketing, Finance, Legal and, Administration",
    "Tennis, Ski, Golf and, gym and, match",
    "football, climbing and gym"
  )
)

# Stack 50 copies of the toy corpus on top of each other to have enough data.
# Obviously do not do that in the real world.
data <- do.call(rbind, rep(list(data), 50))

# Train the model on the labelled data. The model path points at "model.bin"
# inside the working directory captured in `wd`. The outer parentheses make R
# print the value returned by dc_train() in addition to assigning it to `model`.
(model <- dc_train(
  model = file.path(wd, "model.bin"),
  data = data,
  lang = "en"
))
#> Indexing events with TwoPass using cutoff of 5
#> 
#>  Computing event counts...  done. 200 events
#>  Indexing...  done.
#> Sorting and merging events... done. Reduced 200 events to 4.
#> Done indexing in 0.04 s.
#> Incorporating indexed data for training...  
#> done.
#>  Number of Event Tokens: 4
#>      Number of Outcomes: 2
#>    Number of Predicates: 18
#> ...done.
#> Computing model parameters ...
#> Performing 100 iterations.
#>   1:  ... loglikelihood=-138.62943611198907  0.75
#>   2:  ... loglikelihood=-90.05132870679027   1.0
#>   3:  ... loglikelihood=-68.48688352319556   1.0
#>   4:  ... loglikelihood=-55.475703607394934  1.0
#>   5:  ... loglikelihood=-46.62565005433217   1.0
#>   6:  ... loglikelihood=-40.19153916662768   1.0
#>   7:  ... loglikelihood=-35.300372845494124  1.0
#>   8:  ... loglikelihood=-31.45770259589135   1.0
#>   9:  ... loglikelihood=-28.360434714031207  1.0
#>  10:  ... loglikelihood=-25.811965117229352  1.0
#>  11:  ... loglikelihood=-23.67910194735546   1.0
#>  12:  ... loglikelihood=-21.868407310747656  1.0
#>  13:  ... loglikelihood=-20.312411053920847  1.0
#>  14:  ... loglikelihood=-18.961190747341263  1.0
#>  15:  ... loglikelihood=-17.777026534920605  1.0
#>  16:  ... loglikelihood=-16.7308955787924    1.0
#>  17:  ... loglikelihood=-15.800107742996762  1.0
#>  18:  ... loglikelihood=-14.966671656304591  1.0
#>  19:  ... loglikelihood=-14.21614104882692   1.0
#>  20:  ... loglikelihood=-13.536784529138925  1.0
#>  21:  ... loglikelihood=-12.918977853059902  1.0
#>  22:  ... loglikelihood=-12.354752178772456  1.0
#>  23:  ... loglikelihood=-11.837453572211473  1.0
#>  24:  ... loglikelihood=-11.361483099110494  1.0
#>  25:  ... loglikelihood=-10.922096124141827  1.0
#>  26:  ... loglikelihood=-10.515245676878507  1.0
#>  27:  ... loglikelihood=-10.137459008768577  1.0
#>  28:  ... loglikelihood=-9.785739425480292   1.0
#>  29:  ... loglikelihood=-9.457487563266024   1.0
#>  30:  ... loglikelihood=-9.150437765017292   1.0
#>  31:  ... loglikelihood=-8.862606285622736   1.0
#>  32:  ... loglikelihood=-8.592248840697614   1.0
#>  33:  ... loglikelihood=-8.337825591850594   1.0
#>  34:  ... loglikelihood=-8.0979720933992 1.0
#>  35:  ... loglikelihood=-7.871475050325472   1.0
#>  36:  ... loglikelihood=-7.657251983863378   1.0
#>  37:  ... loglikelihood=-7.454334089827545   1.0
#>  38:  ... loglikelihood=-7.261851720330281   1.0
#>  39:  ... loglikelihood=-7.079022032591023   1.0
#>  40:  ... loglikelihood=-6.905138436971859   1.0
#>  41:  ... loglikelihood=-6.739561545993127   1.0
#>  42:  ... loglikelihood=-6.581711381232067   1.0
#>  43:  ... loglikelihood=-6.431060638951914   1.0
#>  44:  ... loglikelihood=-6.287128850518974   1.0
#>  45:  ... loglikelihood=-6.149477302028826   1.0
#>  46:  ... loglikelihood=-6.017704600525814   1.0
#>  47:  ... loglikelihood=-5.891442792880982   1.0
#>  48:  ... loglikelihood=-5.770353958661712   1.0
#>  49:  ... loglikelihood=-5.654127210859458   1.0
#>  50:  ... loglikelihood=-5.542476048674285   1.0
#>  51:  ... loglikelihood=-5.435136015106892   1.0
#>  52:  ... loglikelihood=-5.331862619214681   1.0
#>  53:  ... loglikelihood=-5.232429488815388   1.0
#>  54:  ... loglikelihood=-5.136626724382094   1.0
#>  55:  ... loglikelihood=-5.0442594290404354  1.0
#>  56:  ... loglikelihood=-4.955146393088874   1.0
#>  57:  ... loglikelihood=-4.869118914431885   1.0
#>  58:  ... loglikelihood=-4.786019738832144   1.0
#>  59:  ... loglikelihood=-4.705702106028015   1.0
#>  60:  ... loglikelihood=-4.628028889589052   1.0
#>  61:  ... loglikelihood=-4.5528718199425855  1.0
#>  62:  ... loglikelihood=-4.480110781344061   1.0
#>  63:  ... loglikelihood=-4.409633174714674   1.0
#>  64:  ... loglikelihood=-4.341333339261739   1.0
#>  65:  ... loglikelihood=-4.27511202665486    1.0
#>  66:  ... loglikelihood=-4.210875922272933   1.0
#>  67:  ... loglikelihood=-4.1485372086810575  1.0
#>  68:  ... loglikelihood=-4.088013167057432   1.0
#>  69:  ... loglikelihood=-4.029225812778168   1.0
#>  70:  ... loglikelihood=-3.972101561795021   1.0
#>  71:  ... loglikelihood=-3.916570924814211   1.0
#>  72:  ... loglikelihood=-3.862568226612132   1.0
#>  73:  ... loglikelihood=-3.810031348111636   1.0
#>  74:  ... loglikelihood=-3.758901489095115   1.0
#>  75:  ... loglikelihood=-3.7091229496550318  1.0
#>  76:  ... loglikelihood=-3.6606429286784046  1.0
#>  77:  ... loglikelihood=-3.6134113378376385  1.0
#>  78:  ... loglikelihood=-3.567380629713566   1.0
#>  79:  ... loglikelihood=-3.522505638814703   1.0
#>  80:  ... loglikelihood=-3.4787434343780568  1.0
#>  81:  ... loglikelihood=-3.436053183946128   1.0
#>  82:  ... loglikelihood=-3.3943960268110023  1.0
#>  83:  ... loglikelihood=-3.353734956503564   1.0
#>  84:  ... loglikelihood=-3.3140347115826034  1.0
#>  85:  ... loglikelihood=-3.2752616740483527  1.0
#>  86:  ... loglikelihood=-3.237383774766797   1.0
#>  87:  ... loglikelihood=-3.200370405347152   1.0
#>  88:  ... loglikelihood=-3.1641923359645276  1.0
#>  89:  ... loglikelihood=-3.1288216386655394  1.0
#>  90:  ... loglikelihood=-3.094231615734874   1.0
#>  91:  ... loglikelihood=-3.060396732737595   1.0
#>  92:  ... loglikelihood=-3.027292555884969   1.0
#>  93:  ... loglikelihood=-2.9948956934021496  1.0
#>  94:  ... loglikelihood=-2.963183740602128   1.0
#>  95:  ... loglikelihood=-2.9321352283960547  1.0
#>  96:  ... loglikelihood=-2.9017295749916556  1.0
#>  97:  ... loglikelihood=-2.871947040551748   1.0
#>  98:  ... loglikelihood=-2.8427686846036746  1.0
#>  99:  ... loglikelihood=-2.814176326006921   1.0
#> 100:  ... loglikelihood=-2.7861525053013416  1.0
#> [1] "/home/jp/news-r/decipher/vignettes/model.bin"

# New, unlabelled texts to run through the model; `docs` is the only column.
documents <- data.frame(docs = c(
  "This discusses golf which is a sport.",
  "This documents is about business administration.",
  "This is about people who do sport, go to the gym and play tennis.",
  "Some play tennis and work in Finance"
))

# Pass the trained model and the new documents to dc() and keep the result.
classified <- dc(model, documents)
# Write the result to the console; a single space is cat()'s default separator.
cat(classified, sep = " ")
#> Sport    "This discusses golf which is a sport." "This documents is about business administration." "This is about people who do sport, go to the gym and play tennis." "Some play tennis and work in Finance"