Load Packages
library(topicmodels)  # LDA and CTM topic models
library(tm)           # corpora and document-term matrices
library(tidyverse)    # data manipulation and ggplot2
library(tidytext)     # tidiers for topic models, tokenizers, sentiment lexicons
library(tidyr)        # pivot_wider() and friends
library(slam)         # sparse matrix utilities (row_sums, col_sums)
library(ggrepel)      # non-overlapping text labels for ggplot2
library(MASS)         # isoMDS() for non-metric multidimensional scaling
library(textstem)     # stemming and lemmatization helpers
library(readtext)     # read plain-text files into a data frame

Use correlated topic models (CTM) to summarize a large text dataset of reviews from the Internet Movie Database (IMDb)'s online movie review platform

Set up the text data

data_dir <- "/Users/Marcus/Documents/R/imdb_pos_reviews"
data_big <- readtext(paste0(data_dir, "/*.txt"), encoding = "UTF-8")
dim(data_big)
## [1] 12500     2

Draw a random sample of 100 reviews

sample_reviews <- sample_n(data_big, 100) 
head(sample_reviews)
## readtext object consisting of 6 documents and 0 docvars.
## # Description: df [6 × 2]
##   doc_id       text               
##   <chr>        <chr>              
## 1 10137_10.txt "\"EXCUSE ME!\"..."
## 2 9461_8.txt   "\"I think a \"..."
## 3 4890_9.txt   "\"I original\"..."
## 4 7792_10.txt  "\"I watched \"..."
## 5 3609_8.txt   "\"Warning: I\"..."
## 6 2736_9.txt   "\"If you rep\"..."
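
Note that sample_n() draws a different random sample on each run, so the topics below will vary between knits. For a reproducible pipeline, seed the generator before sampling; a minimal sketch (the seed value here is arbitrary):

set.seed(1234)
sample_reviews <- sample_n(data_big, 100)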

Document-Term Matrix (DTM)

corpus <- Corpus(VectorSource(sample_reviews$text))
corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 100
text_DTM <- DocumentTermMatrix(corpus,
                               control = list(stemming = TRUE, stopwords = TRUE,
                                              minWordLength = 3, removeNumbers = TRUE,
                                              removePunctuation = TRUE))
text_DTM
## <<DocumentTermMatrix (documents: 100, terms: 3724)>>
## Non-/sparse entries: 9500/362900
## Sparsity           : 97%
## Maximal term length: 29
## Weighting          : term frequency (tf)
dim(text_DTM)
## [1]  100 3724
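
To spot-check the vocabulary before weighting, tm's findFreqTerms() lists terms above a frequency threshold (the cutoff of 25 here is arbitrary):

findFreqTerms(text_DTM, lowfreq = 25)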

TF-IDF

For each term, the line below averages its relative frequency over the documents that contain it and multiplies by the log inverse document frequency; terms with low values do little to distinguish documents.

term_tfidf <- tapply(text_DTM$v / row_sums(text_DTM)[text_DTM$i], text_DTM$j, mean) *
  log2(nDocs(text_DTM) / col_sums(text_DTM > 0))
summary(term_tfidf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01176 0.02498 0.04076 0.05423 0.06779 0.66439
plot(density(term_tfidf))
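
In symbols, writing $x_{ij}$ for the count of term $j$ in document $i$, $n_j$ for the number of documents containing term $j$, and $N$ for the total number of documents, the quantity computed above is

$$\mathrm{tf\text{-}idf}_j = \frac{1}{n_j} \sum_{i \,:\, x_{ij} > 0} \frac{x_{ij}}{\sum_k x_{ik}} \cdot \log_2\!\frac{N}{n_j}$$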

Trim by median TF-IDF

Keep only terms whose mean TF-IDF is at or above a cutoff chosen near the median of 0.041 from the summary above, and drop any documents left empty.

alpha <- 0.047
text_DTM_trimmed <- text_DTM[row_sums(text_DTM) > 0, term_tfidf >= alpha]
dim(text_DTM_trimmed)
## [1]  100 1539

10-fold cross validation

control_CTM_VEM <- list(
  estimate.beta = TRUE, verbose = 0, prefix = tempfile(), save = 0, keep = 0,
  seed = as.integer(Sys.time()), nstart = 1L, best = TRUE,
  var = list(iter.max = 100, tol = 10^-6),
  em = list(iter.max = 500, tol = 10^-4),
  cg = list(iter.max = 100, tol = 10^5)  # tol = 10^5 matches the printed output below, though 10^-5 was likely intended
)
set.seed(100)
topics <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 15)
D <- length(sample_reviews$text)
# assign each of the D documents to one of 10 folds, as evenly as possible
folding <- sample(rep(seq_len(10), ceiling(D / 10))[seq_len(D)])
table(folding)
## folding
##  1  2  3  4  5  6  7  8  9 10 
## 10 10 10 10 10 10 10 10 10 10

Write a loop to compute the held-out perplexity for each candidate number of topics

perp_by_col <- vector()  # one column per candidate k; one row per fold
for (k in topics) {
  perp_by_row <- vector()
  for (chain in seq_len(10)) {
    # fit on nine folds, then score the held-out fold against the fitted model
    training <- CTM(text_DTM_trimmed[folding != chain, ], k = k,
                    control = control_CTM_VEM)
    testing <- CTM(text_DTM_trimmed[folding == chain, ], model = training,
                   control = control_CTM_VEM)
    perp_by_row <- rbind(perp_by_row, perplexity(testing))
  }
  perp_by_col <- cbind(perp_by_col, perp_by_row)
}

Plot perplexity following 10-fold cross validation

# Plot perplexity: one line per fold across the candidate values of k
transpose <- t(perp_by_col)
matplot(transpose, type = "l", col = rainbow(10), lty = 2, lwd = 2,
        ylab = "Perplexity", xlab = "K", main = "CTM-10-fold cross validation", xaxt = "n")
axis(1, at = 1:10, labels = c("k=2", "k=3", "k=4", "k=5", "k=6", "k=7", "k=8", "k=9", "k=10", "k=15"), cex.axis = 0.5)
perp_by_col_mean <- colMeans(perp_by_col)
lines(perp_by_col_mean, col = "black", lwd = 4, lty = 1)
led <- c(paste0("fold=", 1:10), "Average")
legend("topright", led, lwd = 2, lty = c(rep(2, 10), 1), col = c(rainbow(10), "black"), cex = 0.65)
abline(v = 4, col = "gray60", lty = 2)

Plot average perplexity

# Average Perplexity
{plot(perp_by_col_mean, pch = 20, ylab = 'Perplexity', xlab = "K", main = "CTM-10-fold cross validation", 
      xaxt = "n") 
  axis(1, at = 1:10, labels = c("k=2","k=3","k=4","k=5","k=6","k=7","k=8","k=9","k=10","k=15"), cex = 0.5)
  lines(perp_by_col_mean, lwd = 1, lty = 2, col = "red")}
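
The k that minimizes the average held-out perplexity can be read off directly:

topics[which.min(perp_by_col_mean)]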

Set up CTM controls

control_CTM_VEM1 <- list(
  estimate.beta = TRUE, verbose = 0, prefix = tempfile(), save = 0, keep = 0,
  seed = 1421313709, nstart = 1L, best = TRUE,
  var = list(iter.max = 500, tol = 10^-6),
  em = list(iter.max = 1000, tol = 10^-4),
  cg = list(iter.max = 500, tol = 10^5)
)
control_CTM_VEM  # inspect the control settings used for cross-validation
## $estimate.beta
## [1] TRUE
## 
## $verbose
## [1] 0
## 
## $prefix
## [1] "/var/folders/pd/lhn8jssd087fgdc1f11ckw280000gn/T//Rtmps6DHcf/file692b7a02f64"
## 
## $save
## [1] 0
## 
## $keep
## [1] 0
## 
## $seed
## [1] 1674031219
## 
## $nstart
## [1] 1
## 
## $best
## [1] TRUE
## 
## $var
## $var$iter.max
## [1] 100
## 
## $var$tol
## [1] 1e-06
## 
## 
## $em
## $em$iter.max
## [1] 500
## 
## $em$tol
## [1] 1e-04
## 
## 
## $cg
## $cg$iter.max
## [1] 100
## 
## $cg$tol
## [1] 1e+05

Generate a 9-topic CTM

CTM9 <- CTM(text_DTM_trimmed, k = 9, control = control_CTM_VEM1, 
            seed = 12244) 
CTM9
## A CTM_VEM topic model with 9 topics.
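
For comparing runs, topicmodels also exposes the fitted model's log-likelihood:

logLik(CTM9)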

CTM Output

## Topics
topics9 <- posterior(CTM9)$topics
## Let's look at the probability that each document fits into each of the topics
topics9_df <- as.data.frame(topics9)  # data-frame copy of the posterior
rownames(topics9) <- sample_reviews$doc_id
print(topics9)
##                  1           2           3           4           5           6
##   [1,] 0.004572189 0.004590897 0.958512637 0.004855974 0.004325874 0.004459373
##   [2,] 0.004641515 0.958950637 0.004642049 0.004555183 0.004960692 0.004492811
##   [3,] 0.004345445 0.004550946 0.004681300 0.004608385 0.004646427 0.004719373
##   [4,] 0.004701423 0.004256805 0.004692488 0.004973038 0.004738387 0.004468124
##   [5,] 0.004583280 0.005016690 0.003865551 0.005293549 0.004913607 0.005414254
##   [6,] 0.004508112 0.004570335 0.004727282 0.958809292 0.004754038 0.004719244
##   [7,] 0.958854895 0.004696501 0.004582116 0.004706307 0.004465673 0.004681347
##   [8,] 0.004716735 0.004721950 0.004625747 0.004645419 0.958884368 0.004739166
##   [9,] 0.004840342 0.004477661 0.004977206 0.004536253 0.004605800 0.004856842
##  [10,] 0.004547528 0.004736575 0.004467184 0.004781352 0.958564644 0.004714116
##  [11,] 0.004369336 0.004812427 0.005089422 0.004275916 0.004276082 0.004990028
##  [12,] 0.004398270 0.004181017 0.005431126 0.004523814 0.004527923 0.004626138
##  [13,] 0.004251027 0.004764759 0.958603483 0.004702641 0.004385279 0.004570910
##  [14,] 0.004640448 0.004715750 0.004727468 0.004552575 0.004499536 0.004344811
##  [15,] 0.073776945 0.068806316 0.081251103 0.072200502 0.067809989 0.077633134
##  [16,] 0.004514617 0.004858208 0.004823637 0.004463588 0.004682889 0.958755897
##  [17,] 0.958983954 0.004567732 0.004478431 0.004587831 0.004713880 0.004761311
##  [18,] 0.004586535 0.957793330 0.004284744 0.005532068 0.004901680 0.004834230
##  [19,] 0.005016588 0.004413223 0.004477130 0.004457514 0.958802581 0.004673301
##  [20,] 0.004443522 0.004795555 0.004758021 0.004889610 0.958778553 0.004463019
##  [21,] 0.004484079 0.004553914 0.004470187 0.004655458 0.958885280 0.005043993
##  [22,] 0.004538554 0.004486020 0.004693006 0.958796268 0.004427973 0.004789045
##  [23,] 0.004447120 0.004493647 0.004540948 0.004682325 0.958787362 0.004681101
##  [24,] 0.004328942 0.004486898 0.005151540 0.004686168 0.004572275 0.005087776
##  [25,] 0.004520181 0.004310422 0.004418731 0.004652149 0.004828145 0.958317549
##  [26,] 0.004400584 0.004521469 0.004661425 0.958337721 0.004169630 0.004680478
##  [27,] 0.004531627 0.004412687 0.005002784 0.004958679 0.004769512 0.004375065
##  [28,] 0.004621317 0.004650226 0.004517196 0.004663961 0.958886608 0.004663911
##  [29,] 0.004518034 0.004531274 0.004755713 0.004949861 0.004257726 0.004457810
##  [30,] 0.004305714 0.005002860 0.004672018 0.958658572 0.004895531 0.004252884
##  [31,] 0.004263493 0.005029162 0.004698407 0.004921405 0.957850225 0.004495580
##  [32,] 0.005130311 0.004566658 0.004302615 0.958408377 0.004531973 0.005481015
##  [33,] 0.004898663 0.958405559 0.004390988 0.004511315 0.004618409 0.004801521
##  [34,] 0.004713253 0.004722477 0.004480897 0.004516470 0.004772725 0.004771897
##  [35,] 0.004527946 0.004632299 0.004350218 0.004881899 0.004389449 0.958514664
##  [36,] 0.004222205 0.004384670 0.004348722 0.957954846 0.004837076 0.005818281
##  [37,] 0.957448534 0.004402171 0.005027641 0.004429937 0.004596815 0.005224537
##  [38,] 0.004442610 0.004751634 0.004328295 0.004747608 0.958783265 0.004852464
##  [39,] 0.004282207 0.004220536 0.004901615 0.005053606 0.958238263 0.004684139
##  [40,] 0.004258079 0.004555753 0.004497606 0.958328186 0.004637666 0.004608441
##  [41,] 0.004500726 0.958927885 0.004468643 0.004665517 0.004728480 0.004706655
##  [42,] 0.085322484 0.065850131 0.078211275 0.067614086 0.083570300 0.065622535
##  [43,] 0.004729124 0.004835191 0.004293302 0.004626415 0.004149648 0.004680620
##  [44,] 0.004616186 0.004718267 0.004449527 0.004824936 0.004783812 0.004485513
##  [45,] 0.004446456 0.958739179 0.004737213 0.005001383 0.004560286 0.004417897
##  [46,] 0.004279056 0.006019290 0.004227529 0.005130279 0.005314008 0.004314578
##  [47,] 0.004514733 0.004349506 0.004600708 0.004809984 0.004998619 0.958458858
##  [48,] 0.004419953 0.004362020 0.004812870 0.005089331 0.004857135 0.958791129
##  [49,] 0.005000368 0.004395257 0.005163322 0.004541570 0.004493335 0.005083763
##  [50,] 0.004532164 0.004383349 0.004780224 0.004623043 0.004553641 0.004887073
##  [51,] 0.005175598 0.004188626 0.004435902 0.004387939 0.004654103 0.005535035
##  [52,] 0.004706992 0.004257053 0.004133200 0.958303551 0.004675232 0.004854668
##  [53,] 0.004514410 0.004655382 0.004338533 0.004533894 0.004715341 0.004620308
##  [54,] 0.004433429 0.004856768 0.004471693 0.004478488 0.958643591 0.004608573
##  [55,] 0.004539072 0.004620176 0.958865078 0.004605073 0.004679400 0.004509290
##  [56,] 0.004892707 0.004499886 0.004732192 0.958770336 0.004613549 0.004618443
##  [57,] 0.005100884 0.004797128 0.004447794 0.004401826 0.004648039 0.958549897
##  [58,] 0.004607554 0.004553632 0.958777424 0.004753952 0.004521267 0.004481138
##  [59,] 0.005096425 0.004307014 0.005005503 0.958407024 0.004561417 0.004337187
##  [60,] 0.004060704 0.004400540 0.004527282 0.004930686 0.005119160 0.958522097
##  [61,] 0.004835011 0.004557930 0.958501395 0.004590893 0.004172892 0.004818776
##  [62,] 0.004451484 0.958869596 0.004817442 0.004515757 0.004550740 0.004622061
##  [63,] 0.004667813 0.958793626 0.004625761 0.004623534 0.004478927 0.004731995
##  [64,] 0.958428423 0.004421239 0.004604068 0.004594456 0.004406903 0.004723934
##  [65,] 0.004854148 0.004954505 0.004353823 0.004321960 0.957987524 0.004720290
##  [66,] 0.004203072 0.004503556 0.004455560 0.004445811 0.958021292 0.005728232
##  [67,] 0.004377479 0.004407082 0.004809036 0.004711473 0.004884299 0.004564555
##  [68,] 0.004479634 0.958667570 0.004294920 0.004615248 0.004707708 0.004552802
##  [69,] 0.004712362 0.005123020 0.004877477 0.004769438 0.004350627 0.004319389
##  [70,] 0.005012850 0.004783305 0.004668242 0.004631732 0.004374793 0.004508630
##  [71,] 0.004698268 0.004401414 0.004612729 0.005456663 0.004329156 0.004551745
##  [72,] 0.937071287 0.006659069 0.006799279 0.007084449 0.006689299 0.007465825
##  [73,] 0.004889540 0.004319155 0.004294201 0.004819900 0.004598569 0.958604755
##  [74,] 0.070252901 0.076234011 0.069500375 0.072997171 0.074027702 0.065028941
##  [75,] 0.004765701 0.004639151 0.004839531 0.004694313 0.004460961 0.004503604
##  [76,] 0.004206476 0.004490482 0.004822409 0.005067040 0.004615767 0.005159243
##  [77,] 0.958582946 0.004390428 0.004213489 0.005124445 0.004582610 0.004911335
##  [78,] 0.004653853 0.958920759 0.004609853 0.004626842 0.004535532 0.004781692
##  [79,] 0.958849128 0.004544444 0.004411656 0.004659536 0.004664330 0.004678867
##  [80,] 0.004779241 0.004330944 0.958628440 0.004593959 0.004471836 0.004716973
##  [81,] 0.004472259 0.004866943 0.004283569 0.004933429 0.004294050 0.958171446
##  [82,] 0.005099169 0.004388352 0.004787232 0.004538969 0.004578854 0.958712840
##  [83,] 0.004628613 0.004218542 0.957551511 0.004723566 0.004381876 0.004855717
##  [84,] 0.004678039 0.004646612 0.004873677 0.958960278 0.004566847 0.004653186
##  [85,] 0.004270957 0.004737119 0.005119906 0.004763445 0.004868758 0.958629964
##  [86,] 0.004497436 0.004595013 0.004636758 0.004978771 0.004458029 0.004567739
##  [87,] 0.004564034 0.004583423 0.958845606 0.004602534 0.004637053 0.004749276
##  [88,] 0.004603991 0.004305948 0.958642007 0.004834793 0.004582578 0.004593598
##  [89,] 0.004755634 0.004185311 0.004536351 0.004954936 0.004346703 0.957984976
##  [90,] 0.004731243 0.004392021 0.004695588 0.004450341 0.005103719 0.004791072
##  [91,] 0.004584986 0.004474544 0.004478057 0.005044957 0.004593062 0.958869575
##  [92,] 0.958577138 0.004459393 0.004901180 0.004717163 0.004480483 0.004645744
##  [93,] 0.004170768 0.004452421 0.004374562 0.956955080 0.004468083 0.004621415
##  [94,] 0.004875094 0.004138409 0.005151108 0.004419952 0.004499561 0.004822398
##  [95,] 0.005273765 0.004440043 0.004336206 0.004346610 0.004883243 0.958617298
##  [96,] 0.089043045 0.076438177 0.080961862 0.082950745 0.076484629 0.106924640
##  [97,] 0.004630170 0.004668572 0.004478331 0.004413894 0.004946473 0.958387927
##  [98,] 0.081267873 0.065162770 0.079710733 0.069212953 0.075713803 0.075335261
##  [99,] 0.958409269 0.004833899 0.005030017 0.004327827 0.004369806 0.004527705
## [100,] 0.004409133 0.958766221 0.004575536 0.004673213 0.004874632 0.004490999
##                  7           8           9
##   [1,] 0.005066257 0.004715185 0.008901613
##   [2,] 0.004516604 0.004752853 0.008487656
##   [3,] 0.005064221 0.958889728 0.008494175
##   [4,] 0.004792054 0.958783952 0.008593730
##   [5,] 0.957815537 0.004551808 0.008545722
##   [6,] 0.004717333 0.004452172 0.008742191
##   [7,] 0.004576846 0.004773752 0.008662562
##   [8,] 0.004543107 0.004462987 0.008660519
##   [9,] 0.958802398 0.004340591 0.008562908
##  [10,] 0.004592242 0.004571876 0.009024484
##  [11,] 0.958581930 0.004904208 0.008700650
##  [12,] 0.958363484 0.005166684 0.008781545
##  [13,] 0.005313653 0.004750715 0.008657533
##  [14,] 0.005105937 0.958723196 0.008690279
##  [15,] 0.068692291 0.068278694 0.421551025
##  [16,] 0.004686077 0.004496136 0.008718951
##  [17,] 0.004873556 0.004585386 0.008447919
##  [18,] 0.004623062 0.004232841 0.009211509
##  [19,] 0.004702683 0.004831825 0.008625156
##  [20,] 0.004629321 0.004583659 0.008658739
##  [21,] 0.004510666 0.004889509 0.008506914
##  [22,] 0.004817893 0.004718722 0.008732519
##  [23,] 0.004672130 0.005026348 0.008669019
##  [24,] 0.958493800 0.004358229 0.008834372
##  [25,] 0.005086709 0.004713357 0.009152757
##  [26,] 0.004840832 0.005722687 0.008665173
##  [27,] 0.958719026 0.004554519 0.008676102
##  [28,] 0.004678401 0.004636860 0.008681521
##  [29,] 0.005164827 0.958552276 0.008812480
##  [30,] 0.004970855 0.004543982 0.008697584
##  [31,] 0.004943067 0.004440079 0.009358582
##  [32,] 0.004362834 0.004538440 0.008677778
##  [33,] 0.004776785 0.004513508 0.009083251
##  [34,] 0.004487346 0.958743864 0.008791072
##  [35,] 0.005197658 0.004587904 0.008917963
##  [36,] 0.005247001 0.004464953 0.008722244
##  [37,] 0.004368592 0.004771625 0.009730148
##  [38,] 0.005032911 0.004476635 0.008584579
##  [39,] 0.005666864 0.004283861 0.008668910
##  [40,] 0.005442761 0.004781664 0.008889844
##  [41,] 0.004776119 0.004649668 0.008576307
##  [42,] 0.069625463 0.064980634 0.419203091
##  [43,] 0.955490030 0.008180490 0.009015179
##  [44,] 0.004635384 0.958933460 0.008552914
##  [45,] 0.004726493 0.004679683 0.008691411
##  [46,] 0.004599179 0.956290131 0.009825950
##  [47,] 0.004287305 0.005189206 0.008791081
##  [48,] 0.004531085 0.004581462 0.008555014
##  [49,] 0.958612444 0.004197061 0.008512881
##  [50,] 0.958847997 0.004759099 0.008633410
##  [51,] 0.957493617 0.005006044 0.009123137
##  [52,] 0.004866405 0.005340660 0.008862239
##  [53,] 0.005152639 0.958460123 0.009009369
##  [54,] 0.004815602 0.004809848 0.008882009
##  [55,] 0.004784906 0.004768220 0.008628784
##  [56,] 0.004633995 0.004485821 0.008753071
##  [57,] 0.004798194 0.004359058 0.008897180
##  [58,] 0.004869151 0.004685668 0.008750215
##  [59,] 0.005216239 0.004378250 0.008690943
##  [60,] 0.004917886 0.004963855 0.008557791
##  [61,] 0.005220016 0.004531763 0.008771323
##  [62,] 0.005049726 0.004630810 0.008492384
##  [63,] 0.004924407 0.004457027 0.008696910
##  [64,] 0.004869573 0.004867145 0.009084260
##  [65,] 0.004609215 0.005011257 0.009187279
##  [66,] 0.005202639 0.004615335 0.008824502
##  [67,] 0.958521386 0.004744451 0.008980239
##  [68,] 0.004797362 0.005098378 0.008786378
##  [69,] 0.958761891 0.004573902 0.008511893
##  [70,] 0.004659247 0.958889571 0.008471630
##  [71,] 0.004713092 0.958708218 0.008528715
##  [72,] 0.007141927 0.007235438 0.013853429
##  [73,] 0.005164300 0.004681409 0.008628171
##  [74,] 0.065762673 0.085372523 0.420823704
##  [75,] 0.004657470 0.958933375 0.008505893
##  [76,] 0.958524220 0.004444398 0.008669965
##  [77,] 0.004908042 0.004545446 0.008741259
##  [78,] 0.004712334 0.004571290 0.008587845
##  [79,] 0.004858477 0.004692745 0.008640819
##  [80,] 0.004508485 0.005288308 0.008681813
##  [81,] 0.005140525 0.004757645 0.009080134
##  [82,] 0.005069428 0.004294066 0.008531090
##  [83,] 0.005323007 0.004856127 0.009461042
##  [84,] 0.004522556 0.004602057 0.008496748
##  [85,] 0.004576674 0.004408216 0.008624962
##  [86,] 0.958871295 0.004824707 0.008570253
##  [87,] 0.004693203 0.004593550 0.008731320
##  [88,] 0.004721726 0.004873567 0.008841792
##  [89,] 0.005994166 0.004352322 0.008889601
##  [90,] 0.004448965 0.958776760 0.008610292
##  [91,] 0.004845741 0.004566548 0.008542529
##  [92,] 0.004586684 0.004703481 0.008928734
##  [93,] 0.007136991 0.004822669 0.008998011
##  [94,] 0.004956075 0.958788336 0.008349067
##  [95,] 0.004988815 0.004419956 0.008694064
##  [96,] 0.084538952 0.082242321 0.320415629
##  [97,] 0.004793874 0.004565292 0.009115467
##  [98,] 0.066368393 0.066860334 0.420367880
##  [99,] 0.004924927 0.004630200 0.008946350
## [100,] 0.004565027 0.004957632 0.008687608
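
Each row of topics9 is a posterior distribution over the nine topics, so the rows should sum to one; a quick sanity check:

summary(rowSums(topics9))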
## Let's look at which topic each document is assigned to.
main_topic9 <- as.data.frame(topics(CTM9))
rownames(main_topic9) <- sample_reviews$doc_id
colnames(main_topic9) <- "Main_Topic"
print(main_topic9)
##              Main_Topic
## 10137_10.txt          3
## 9461_8.txt            2
## 4890_9.txt            8
## 7792_10.txt           8
## 3609_8.txt            7
## 2736_9.txt            4
## 9513_9.txt            1
## 12389_8.txt           5
## 8513_10.txt           7
## 5270_10.txt           5
## 7339_10.txt           7
## 648_8.txt             7
## 10904_9.txt           3
## 8359_7.txt            8
## 7118_10.txt           9
## 5049_9.txt            6
## 4761_7.txt            1
## 368_8.txt             2
## 3278_10.txt           5
## 7179_10.txt           5
## 8728_7.txt            5
## 1233_10.txt           4
## 10774_10.txt          5
## 149_9.txt             7
## 539_7.txt             6
## 4927_10.txt           4
## 6914_10.txt           7
## 9981_7.txt            5
## 2816_9.txt            8
## 6725_8.txt            4
## 4395_10.txt           5
## 7302_10.txt           4
## 11438_9.txt           2
## 2666_10.txt           8
## 4785_10.txt           6
## 781_9.txt             4
## 3515_8.txt            1
## 131_7.txt             5
## 4976_7.txt            5
## 5601_10.txt           4
## 6475_10.txt           2
## 11720_7.txt           9
## 12059_10.txt          7
## 3853_10.txt           8
## 931_9.txt             2
## 4921_10.txt           8
## 2988_10.txt           6
## 6531_10.txt           6
## 7518_9.txt            7
## 10982_10.txt          7
## 6834_10.txt           7
## 10800_8.txt           4
## 3202_10.txt           8
## 4009_8.txt            5
## 12429_10.txt          3
## 8659_9.txt            4
## 6145_8.txt            6
## 8185_9.txt            3
## 9638_8.txt            4
## 9640_8.txt            6
## 11074_8.txt           3
## 12168_10.txt          2
## 573_10.txt            2
## 50_10.txt             1
## 5668_10.txt           5
## 10758_10.txt          5
## 6160_10.txt           7
## 1304_8.txt            2
## 2879_7.txt            7
## 6947_10.txt           8
## 1546_7.txt            8
## 5819_10.txt           1
## 1863_9.txt            6
## 9542_10.txt           9
## 10422_9.txt           8
## 1498_10.txt           7
## 11678_10.txt          1
## 3431_10.txt           2
## 5475_8.txt            1
## 9239_10.txt           3
## 1655_9.txt            6
## 8638_10.txt           6
## 12078_8.txt           3
## 5651_10.txt           4
## 10728_8.txt           6
## 6231_9.txt            7
## 7395_7.txt            3
## 3292_7.txt            3
## 336_10.txt            6
## 243_9.txt             8
## 2603_9.txt            6
## 5992_10.txt           1
## 8098_7.txt            4
## 658_7.txt             8
## 9319_9.txt            6
## 535_10.txt            9
## 11065_10.txt          6
## 11659_10.txt          9
## 8865_10.txt           1
## 7040_10.txt           2
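
A quick tally of how many reviews fall under each main topic:

table(main_topic9$Main_Topic)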

Top terms

tidy_topics <- tidy(CTM9, matrix = "beta")
# create a top-terms data frame, keeping the top 10 terms per topic
top_terms <- tidy_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% 
  ungroup() %>%
  arrange(topic, -beta)
top_terms
## # A tibble: 90 × 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 vito     0.0258 
##  2     1 buddi    0.0233 
##  3     1 mob      0.0172 
##  4     1 ansel    0.0143 
##  5     1 direct   0.0122 
##  6     1 realiti  0.0116 
##  7     1 punk     0.0115 
##  8     1 penelop  0.0115 
##  9     1 scheider 0.0115 
## 10     1 true     0.00905
## # … with 80 more rows

Visualize the top 10 terms for each topic.

top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

Multidimensional scaling (MDS)

# Non-metric (Kruskal) MDS via MASS::isoMDS
# input: N rows (objects) x p columns (variables)
d <- dist(topics9)       # Euclidean distances between the rows
fit <- isoMDS(d, k = 2)  # k is the number of dimensions
## initial  value 56.247752 
## iter   5 value 32.626745
## iter  10 value 30.718011
## final  value 30.005768 
## converged
fit
## $points
##                [,1]        [,2]
##   [1,]  0.592831006  1.89702274
##   [2,] -0.229950775  1.31550592
##   [3,] -1.634156900 -0.54474818
##   [4,] -1.628402678 -0.54165623
##   [5,] -1.497195758  1.01533687
##   [6,] -0.660357655 -1.65187497
##   [7,]  1.478006366  1.01016370
##   [8,]  0.773494211 -1.55785320
##   [9,] -1.478943012  1.00033128
##  [10,]  0.769279330 -1.55948804
##  [11,] -1.481141890  1.00827621
##  [12,] -1.482889691  1.01161676
##  [13,]  0.590926156  1.89798634
##  [14,] -1.626421662 -0.54265781
##  [15,] -0.007359938 -0.05832704
##  [16,]  1.659261946 -0.29787949
##  [17,]  1.487067310  1.01233049
##  [18,] -0.228236042  1.30733163
##  [19,]  0.769359682 -1.55951915
##  [20,]  0.770891651 -1.55823164
##  [21,]  0.767421921 -1.55917883
##  [22,] -0.662607235 -1.65144630
##  [23,]  0.770688322 -1.55934946
##  [24,] -1.488562354  1.00192549
##  [25,]  1.661687227 -0.29839316
##  [26,] -0.671283444 -1.65402660
##  [27,] -1.479866579  1.00472664
##  [28,]  0.770724541 -1.55820194
##  [29,] -1.628610994 -0.54135433
##  [30,] -0.670980356 -1.64758596
##  [31,]  0.769391254 -1.56987174
##  [32,] -0.672336367 -1.65599161
##  [33,] -0.225220377  1.30467625
##  [34,] -1.629601314 -0.54552416
##  [35,]  1.661188688 -0.30161866
##  [36,] -0.669164282 -1.65465429
##  [37,]  1.482265450  1.01205850
##  [38,]  0.770372707 -1.55930979
##  [39,]  0.769390949 -1.56372976
##  [40,] -0.668812963 -1.65142726
##  [41,] -0.228681165  1.31200283
##  [42,] -0.079413076 -0.11488953
##  [43,] -1.524547322  0.59164921
##  [44,] -1.627534003 -0.54228024
##  [45,] -0.231668018  1.31901649
##  [46,] -1.614542170 -0.54084366
##  [47,]  1.661464174 -0.29243345
##  [48,]  1.660110788 -0.29318764
##  [49,] -1.490589750  1.00069841
##  [50,] -1.474207937  1.00342729
##  [51,] -1.501277634  1.00882543
##  [52,] -0.670860328 -1.65530367
##  [53,] -1.630306555 -0.54676042
##  [54,]  0.770825137 -1.55867036
##  [55,]  0.590252883  1.89641887
##  [56,] -0.661484784 -1.65369850
##  [57,]  1.659398557 -0.30259012
##  [58,]  0.589478296  1.89663958
##  [59,] -0.674575104 -1.65476025
##  [60,]  1.662628428 -0.29236097
##  [61,]  0.583909215  1.90208919
##  [62,] -0.236989317  1.32104265
##  [63,] -0.230074217  1.32233291
##  [64,]  1.482736280  1.01172471
##  [65,]  0.768401005 -1.56914396
##  [66,]  0.768537422 -1.56865963
##  [67,] -1.480154202  1.00843807
##  [68,] -0.221326951  1.30379462
##  [69,] -1.474858563  1.00489994
##  [70,] -1.629636747 -0.54006680
##  [71,] -1.628144446 -0.54059521
##  [72,]  0.326802070  0.11861366
##  [73,]  1.660820104 -0.30043870
##  [74,] -0.069131639 -0.06473421
##  [75,] -1.626369088 -0.53985773
##  [76,] -1.484627864  1.00307007
##  [77,]  1.481007153  1.01164855
##  [78,] -0.229888028  1.32247475
##  [79,]  1.478977800  1.01005495
##  [80,]  0.584699886  1.89897612
##  [81,]  1.663227854 -0.30202477
##  [82,]  1.660241594 -0.30263193
##  [83,]  0.578769246  1.90802325
##  [84,] -0.659591158 -1.64992925
##  [85,]  1.660208733 -0.29502111
##  [86,] -1.469025383  1.00280995
##  [87,]  0.589275845  1.89610855
##  [88,]  0.588261410  1.89813158
##  [89,]  1.654166474 -0.30247920
##  [90,] -1.629218593 -0.54986581
##  [91,]  1.658984975 -0.29637731
##  [92,]  1.479757739  1.01137251
##  [93,] -0.681732203 -1.60904161
##  [94,] -1.637792239 -0.55446311
##  [95,]  1.659201963 -0.30318475
##  [96,]  0.068610904 -0.13609037
##  [97,]  1.660915769 -0.29863610
##  [98,] -0.006655763 -0.08581533
##  [99,]  1.484278353  1.01089099
## [100,] -0.223192260  1.30627134
## 
## $stress
## [1] 30.00577
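
For comparison, classical (metric) MDS is available in base R via cmdscale(); a minimal sketch on the same distance matrix:

fit_classical <- cmdscale(d, k = 2)  # 100 x 2 coordinate matrix
head(fit_classical)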
# plot solution
plot_data <- as.data.frame(cbind(fit$points[, 1], fit$points[, 2], main_topic9$Main_Topic),
                           row.names = sample_reviews$doc_id)
colnames(plot_data) <- c("Coordinate1", "Coordinate2", "Main_Topic")
(p1 <- ggplot(data = plot_data, aes(x = Coordinate1, y = Coordinate2)) + geom_point(size = 2, shape = 23))

# Color palette
ggplot(data = plot_data) +
    geom_point(mapping = aes(x = Coordinate1, y = Coordinate2, color = as.factor(Main_Topic))) + theme(legend.position = "none") 

(p5 <- ggplot(plot_data, aes(Coordinate1, Coordinate2, color = as.factor(Main_Topic)))+
    geom_point()+geom_text_repel(aes(label = row.names(plot_data)), size = 3, max.overlaps = 30))
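
The stress value of about 30 indicates a fairly rough two-dimensional fit. A Shepard diagram, sketched here with MASS::Shepard(), compares the original dissimilarities against the fitted configuration distances:

shep <- Shepard(d, fit$points)
plot(shep, pch = 20, xlab = "Original distance", ylab = "Configuration distance")
lines(shep$x, shep$yf, type = "S", col = "red")  # fitted monotone regression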

More visualizations

Visualize word differences

# using CTM model object, extract beta values and store in a matrix 
review_topics <- tidy(CTM9, matrix = "beta")
# pivot from long to a wide data format and 
# calculate a ratio of topic 2: topic 1 likelihood (uses beta) 
beta_wide1 <- review_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>% 
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
# create the plot with a random selection of 25 terms
beta_wide1 %>%
  sample_n(size = 25) %>%
  ggplot(aes(x = reorder(term, desc(log_ratio)), y = log_ratio)) +
  geom_col() +
  coord_flip() +
  ylab("Log Ratio of beta in Topic 2 / Topic 1") +
  xlab("Term")
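
Instead of a random selection, one might display the most divergent terms; a sketch using dplyr's slice_max() on the absolute log ratio:

beta_wide1 %>%
  slice_max(abs(log_ratio), n = 25) %>%
  ggplot(aes(x = reorder(term, desc(log_ratio)), y = log_ratio)) +
  geom_col() +
  coord_flip() +
  ylab("Log Ratio of beta in Topic 2 / Topic 1") +
  xlab("Term")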

beta_wide2 <- review_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>% 
  filter(topic8 > .001 | topic9 > .001) %>%
  mutate(log_ratio = log2(topic9 / topic8))
# create the plot
beta_wide2 %>%
  sample_n(size = 50) %>%
  ggplot(aes(x = reorder(term, desc(log_ratio)), y = log_ratio)) +
  geom_col() +
  coord_flip() +
  ylab("Log Ratio of beta in Topic 9 / Topic 8") +
  xlab("Term")

Silge and Robinson (2017) - Relationship between sentiment and topics

library(textfeatures)
# assign topic from CTM to observations
# use the function topics() from the topicmodels library to assign the most
# likely topic for each document (in this case, each movie review)
topicmodels::topics(CTM9)
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   2   8   8   7   4   1   5   7   5   7   7   3   8   9   6   1   2   5   5 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   5   4   5   7   6   4   7   5   8   4   5   4   2   8   6   4   1   5   5   4 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   9   7   8   2   8   6   6   7   7   7   4   8   5   3   4   6   3   4   6 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   2   2   1   5   5   7   2   7   8   8   1   6   9   8   7   1   2   1   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   6   6   3   4   6   7   3   3   6   8   6   1   4   8   6   9   6   9   1   2
topic_assigned <- as.data.frame(topicmodels::topics(CTM9)) 
topic_assigned$row_id <- rownames(topic_assigned) 
colnames(topic_assigned) <- c("topic_assigned", "row_id") 
topic_assigned
##     topic_assigned row_id
## 1                3      1
## 2                2      2
## 3                8      3
## 4                8      4
## 5                7      5
## 6                4      6
## 7                1      7
## 8                5      8
## 9                7      9
## 10               5     10
## 11               7     11
## 12               7     12
## 13               3     13
## 14               8     14
## 15               9     15
## 16               6     16
## 17               1     17
## 18               2     18
## 19               5     19
## 20               5     20
## 21               5     21
## 22               4     22
## 23               5     23
## 24               7     24
## 25               6     25
## 26               4     26
## 27               7     27
## 28               5     28
## 29               8     29
## 30               4     30
## 31               5     31
## 32               4     32
## 33               2     33
## 34               8     34
## 35               6     35
## 36               4     36
## 37               1     37
## 38               5     38
## 39               5     39
## 40               4     40
## 41               2     41
## 42               9     42
## 43               7     43
## 44               8     44
## 45               2     45
## 46               8     46
## 47               6     47
## 48               6     48
## 49               7     49
## 50               7     50
## 51               7     51
## 52               4     52
## 53               8     53
## 54               5     54
## 55               3     55
## 56               4     56
## 57               6     57
## 58               3     58
## 59               4     59
## 60               6     60
## 61               3     61
## 62               2     62
## 63               2     63
## 64               1     64
## 65               5     65
## 66               5     66
## 67               7     67
## 68               2     68
## 69               7     69
## 70               8     70
## 71               8     71
## 72               1     72
## 73               6     73
## 74               9     74
## 75               8     75
## 76               7     76
## 77               1     77
## 78               2     78
## 79               1     79
## 80               3     80
## 81               6     81
## 82               6     82
## 83               3     83
## 84               4     84
## 85               6     85
## 86               7     86
## 87               3     87
## 88               3     88
## 89               6     89
## 90               8     90
## 91               6     91
## 92               1     92
## 93               4     93
## 94               8     94
## 95               6     95
## 96               9     96
## 97               6     97
## 98               9     98
## 99               1     99
## 100              2    100
# add the same row_id column to the original data: 
sample_clean <- data.frame(sample_reviews) 
sample_clean$row_id <- rownames(sample_clean) 
# perform a data join to attach the topic assignments to the original data
sample_clean <- sample_clean %>% 
  left_join(topic_assigned, by = "row_id")
head(sample_clean) 
##         doc_id
## 1 10137_10.txt
## 2   9461_8.txt
## 3   4890_9.txt
## 4  7792_10.txt
## 5   3609_8.txt
## 6   2736_9.txt
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               EXCUSE ME!!! HellOOOOOOOOOO!!!!!!!!!!! CUBA GOODING,Jr. Should Have Won An Oscar For His Portrayal In This Film!!! He WAS the film! While the film may be lacking in some areas, Cuba was awesome... and for me, this is the best role that he has ever played! The scene in the movie where he finds out that his mother has died made me break down and cry IN THE THEATER!! I guess I could really relate to this film because I saw the same treatment of people just like that at my own school growing up... what a tragedy! Getting to see the "real" Radio and coach at the end of the movie was really special too! If you can watch this movie and not be moved to tears, you need a heart check! If you liked "Simon Birch" and "The Mighty", you'll love "Radio" too! I wish they made more movies like this...Radio is the Real Deal!
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                              I think a person would be well-advised to read or see (I favour reading) "Twelfth Night" before seeing, or re-seeing "She's The Man". The movie is good on its' own, but comparing the two, and looking for the in-jokes makes it a lot more fun. Shakespeare was inspired by others. I think he'd give a thumbs-up.<br /><br />Harld Bloom said in "Shakespeare, The Invention of the Human", that most of the people in "Twelfth Night" need to be locked up. Malvolio, the person Malcolm is based on, is-for no good reason. People in "She's The Man" are sane in contrast. For instance, Duke Orsino is far more leval than the Duke of Orsino. He also shows that a man can have feelings without being gay. He displays a lot of self-control.<br /><br />It's a teen comedy (a clean one), so it doesn't have the dark edges of the play. For instance Olivia in the play is mourning the death of her brother. In the movie, she has been dumped.<br /><br />If you like Sir Andrew and Sir Toby in the play, they don't have the same attention in the movie.<br /><br />The in-jokes are quite often quick. The hairdresser Pauls' last name is given once. It is Antonio. Lots of people who've read the play say that Antonio has more love for a man than is just friendship. Deep love between men was noted in those days. Some see a sexual side to it-homosexuality was illegal.<br /><br />The only line from the play I caught is where Duke Orsino quotes the coach on greatness during the soccar game. In the play, it is said by Malvolio, quoting Maria).
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                             I originally watched this because I thought it was going to be the sequel to the League of EXTRAORDINARY Gentlemen and this movie is a whole different thing entirely going on here-a comedy! However, I loved it anyways! <br /><br />The League of Gentlemen is apparently some British TV series with some rather odd characters and some sharp humour. This is British comedy so it revolves around being very silly, dressing up in costumes and making lots of fun of Germans and french, homosexual references- in short it's very very funny!<br /><br />THe plot revolves around the writers of the TV show deciding to cancel some of the characters and the characters coming out of their dimension into the writer's dimension to stop that from happening. It's a fun twist and there's plenty of great scenes in this idiotic adventure. I laughed out loud numerous times and applauded the brazen style of humour. This makes Mr Bean look like the watered down wimp he is.(Rowan is much better in Black Adder series btw)<br /><br />This is not Monty Python, but you can never escape the comparison when you are talking British humour, and there are a few similarities but not so many as to keep it from being it's own thing and being fresh. It leans more towards the young ones and Guest House Paradiso in its' comedic style.<br /><br />If you liked this, check out The Young Ones series and Guest House Paradiso movie, and of course, I assume the TV series League of Gentlemen must be rather funny as well.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   I watched this movie probably more than 20 times. The jokes are now 10-15 years old but every time I watch it it makes me fall off my chair. Two of the finest actor Salman Khan and Aamir Khan plays the lead roles here. Even if Aamir Khan is a much better actor and got the better role(smart guy) in the movie Salman Khan matches and sometimes perform even better as the dumb guy. All the characters are memorable. This movie is filled with hilarious one-liners and funny situations(a little too silly probably). Don't try to look for logic in this movie. Let your brain relax for some time. I promise it will be an experience to remember.
## 5 Warning: If the Coen Brothers or David Lynch define your taste in film, disregard this review and move on.<br /><br />Yes, I borrowed the "one line summary" from the book about President Ronald Reagan, but, among other virtues, this movie emphasizes the role that character plays in the lives of honorable human beings. This film is full of honest, decent people, and they have integrity to spare. In a word, they have "character."<br /><br />A small nitpick: Unless you know the history of WW II, you probably don't know that, from Captain Correlli's arrival on the island to the fall of Mussolini, 3 and one-half years have passed. The average viewer might think the romance was of the "whirlwind" variety. That is not so. The romance develops slowly, which gives it both dignity and meaning. The film's deliberate pace may be the director's way of marking time.<br /><br />Some reviews have criticized Cage's Italian accent. The Italian-speaking members of my family assure me that his accent is quite good. <br /><br />The history was right on the mark. Yes, the Germans turned against their Italian allies, who, for the most part, were reluctant allies from the start. If you find that shocking, keep in mind that the French Mediterranean fleet was blown up by the British in 1940, just after France's capitulation, lest it fall in the hands of the Vichy government, or worse, the Nazis.<br /><br />The depiction of the Italians as educated and cultured was a compliment to an educated and cultured civilization.<br /><br />This film was beautifully photographed, and its story was lyrical. The script was not thought-provoking, nor was it clever, but here was a situation where confusion and cleverness were not needed, nor would they have been appropriate. <br /><br />The story is tender, and the message is uplifting. The characters are honest, brave, earnest, sympathetic, and likeable. It's a nice little film. 8/10.<br /><br /> <br /><br />
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     If you repeat a lie enough number of times will it become the truth? 15 park avenue is the story of an alternative reality of a schizophrenic (Mithi). The movie is about her search for her home at a fictitious address where her imaginary husband and 5 children live. Aparna Sen delivers yet another masterpiece. Each and every actor of the movie was better than the other. Konkona Sen looks unbelievably convincing as a schizophrenic. She pulls off the role with such ease and maturity beyond her age. Shabana Azmi is incredible as usual. She plays the dominating and fiercely independent elder sister of Mithi who takes care of her ailing sister and aging mother. She refuses to accept that in-spite of all her strength and courage, she still feels lonely at times. This should have been a very easy movie for Rahul Bose. The role was least bit demanding and anyone could have done the role.<br /><br />The ending of the movie was the most surreal part of the whole park avenue experience. It took me a while to digest that the movie had ended. It left me confused and maybe even a bit disturbed. But later on, it started sinking in. My eyes are black. But if everyone says they are blue, will I still believe that its black??!
##   row_id topic_assigned
## 1      1              3
## 2      2              2
## 3      3              8
## 4      4              8
## 5      5              7
## 6      6              4
sample_clean %>%
  filter(topic_assigned == 7) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(sent_score = ifelse(sentiment == "negative", -1, 1)) %>%
  group_by(row_id) %>%
  summarize(sent_count = sum(sent_score)) %>%
  ggplot(aes(x = row_id, y = sent_count)) +
  geom_col() +
  labs(x = "Review ID", y = "Sentiment Total",
       title = "Sentiment Distribution for Movie Reviews Categorized by Topic 7")
## Joining, by = "word"
## Joining, by = "word"
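
The same summary extends to all topics at once by grouping on the assigned topic and faceting; a sketch along the same lines:

sample_clean %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(sent_score = ifelse(sentiment == "negative", -1, 1)) %>%
  group_by(topic_assigned, row_id) %>%
  summarize(sent_count = sum(sent_score), .groups = "drop") %>%
  ggplot(aes(x = row_id, y = sent_count)) +
  geom_col() +
  facet_wrap(~ topic_assigned, scales = "free_x") +
  labs(x = "Review ID", y = "Sentiment Total", title = "Sentiment by Assigned Topic")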