## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options: collapse source and output into one block and
# prefix printed output with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## ----setup--------------------------------------------------------------------
library(tlda)

## ----echo=FALSE---------------------------------------------------------------
# Draw a reproducible random excerpt of the term-document matrix:
# 10 rows out of rows 2..151 (row 1 is skipped) and 8 of the first 500 columns.
# Rows are sampled before columns so the random stream is consumed in the
# same order as when both calls sit inside a single `[` expression.
set.seed(8)
excerpt_rows <- sample(2:151, size = 10)
excerpt_cols <- sample(1:500, 8)
tdm_excerpt <- biber150_ice_gb[excerpt_rows, excerpt_cols]
tdm_excerpt

## ----eval=FALSE---------------------------------------------------------------
# addmargins(tdm, margin = 1)

## -----------------------------------------------------------------------------
# Top-left corner of the TDM: first 10 rows, first 8 columns.
biber150_ice_gb[seq_len(10), seq_len(8)]

## -----------------------------------------------------------------------------
# Dispersion scores for a single item of the ICE-GB TDM:
# row 1 of the TDM contains the part sizes, row 2 represents "a".
disp(
  partsize = biber150_ice_gb[1, ],
  subfreq = biber150_ice_gb[2, ]
)

## -----------------------------------------------------------------------------
# Pull two named rows out of the Spoken BNC2014 TDM: the subfrequencies of
# "actually" and the "word_count" row, which is used as the part sizes below.
subfreq_actually <- biber150_spokenBNC2014["actually", ]
speaker_word_count <- biber150_spokenBNC2014["word_count", ]

## -----------------------------------------------------------------------------
# Default settings.
disp(partsize = speaker_word_count, subfreq = subfreq_actually)

## -----------------------------------------------------------------------------
# Same item, with directionality set to "gries".
disp(
  directionality = "gries",
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# Same item, with frequency adjustment switched on.
disp(
  freq_adjust = TRUE,
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# Range, using the "relative_withsize" type.
disp_R(
  type = "relative_withsize",
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# DP, using the "gries_2008" formula.
disp_DP(
  formula = "gries_2008",
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# Compute DP once per formula (silently) and stack the results row-wise,
# in the order gries_2008, lijffijt_gries_2012, egbert_etal_2020.
dp_formulas <- c("gries_2008", "lijffijt_gries_2012", "egbert_etal_2020")

dp_scores <- lapply(dp_formulas, function(dp_formula) {
  disp_DP(
    subfreq = subfreq_actually,
    partsize = speaker_word_count,
    formula = dp_formula,
    verbose = FALSE,
    print_score = FALSE
  )
})

compare_DPs <- do.call(rbind, dp_scores)

# Label each row with the publication the formula comes from.
rownames(compare_DPs) <- c(
  "Gries (2008)", "Lijffijt & Gries (2012)", "Egbert et al. (2020)"
)
compare_DPs

## -----------------------------------------------------------------------------
# DA, computed with the "shortcut" procedure.
disp_DA(
  procedure = "shortcut",
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# Compute DA once per procedure (silently) and stack the results row-wise,
# in the order basic, shortcut, shortcut_mod.
da_procedures <- c("basic", "shortcut", "shortcut_mod")

da_scores <- lapply(da_procedures, function(da_procedure) {
  disp_DA(
    subfreq = subfreq_actually,
    partsize = speaker_word_count,
    procedure = da_procedure,
    verbose = FALSE,
    print_score = FALSE
  )
})

compare_DAs <- do.call(rbind, da_scores)

# Human-readable row labels, one per procedure.
rownames(compare_DAs) <- c("Basic procedure", "Shortcut", "Shortcut (modified)")
compare_DAs

## -----------------------------------------------------------------------------
# DKL, using the "base_e" standardization.
disp_DKL(
  standardization = "base_e",
  partsize = speaker_word_count,
  subfreq = subfreq_actually
)

## -----------------------------------------------------------------------------
# Compute DKL once per standardization (silently) and stack the results
# row-wise, in the order o2p, base_e, base_2.
dkl_standardizations <- c("o2p", "base_e", "base_2")

dkl_scores <- lapply(dkl_standardizations, function(dkl_standardization) {
  disp_DKL(
    subfreq = subfreq_actually,
    partsize = speaker_word_count,
    standardization = dkl_standardization,
    verbose = FALSE,
    print_score = FALSE
  )
})

compare_DKLs <- do.call(rbind, dkl_scores)

# Human-readable row labels, one per standardization.
rownames(compare_DKLs) <- c("Odds-to-probability", "Base e", "Base 2")
compare_DKLs

## -----------------------------------------------------------------------------
# Run disp_tdm() on the whole ICE-GB TDM, silently; the part sizes are taken
# from the first row of the matrix (row_partsize = "first").
DM_ice_gb <- disp_tdm(
  tdm = biber150_ice_gb,
  row_partsize = "first",
  verbose = FALSE,
  print_score = FALSE
)

## -----------------------------------------------------------------------------
# Convert to a data frame so that columns can be accessed with `$`.
DM_ice_gb <- data.frame(DM_ice_gb)

## -----------------------------------------------------------------------------
# First ten rows, rounded to two decimal places.
round(DM_ice_gb[seq_len(10), ], digits = 2)

## ----fig.width=3.5, fig.height=2.5, warning=FALSE, message=FALSE, fig.align='center', out.width="35%"----
# Histogram of the DP column over the unit interval, in bins of width 0.05.
# par() returns the previous values of the settings it changes; they are kept
# so the graphics state can be restored once the plot is drawn.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

hist(
  DM_ice_gb$DP,
  main = NULL,
  xlab = "DP",
  xlim = c(0, 1),
  breaks = seq(0, 1, 0.05),
  col = "grey60"
)

# Restore the margins and clipping that were in effect before this chunk.
par(oldpar)

## ----fig.width=3.5, fig.height=2.5, warning=FALSE, message=FALSE, fig.align='center', out.width="35%"----
# Histogram of the D column over the unit interval, in bins of width 0.05.
# par() returns the previous values of the settings it changes; they are kept
# so the graphics state can be restored once the plot is drawn.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

hist(
  DM_ice_gb$D,
  main = NULL,
  xlab = "D", # label matches the plotted column (was mislabelled "DP")
  xlim = c(0, 1),
  breaks = seq(0, 1, 0.05),
  col = "grey60"
)

# Restore the margins and clipping that were in effect before this chunk.
par(oldpar)

## ----fig.width=3.5, fig.height=2.5, warning=FALSE, message=FALSE, fig.align='center', out.width="35%"----
# Histogram of the D2 column over the unit interval, in bins of width 0.05.
# par() returns the previous values of the settings it changes; they are kept
# so the graphics state can be restored once the plot is drawn.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

hist(
  DM_ice_gb$D2,
  main = NULL,
  xlab = "D2", # label matches the plotted column (was mislabelled "DP")
  xlim = c(0, 1),
  breaks = seq(0, 1, 0.05),
  col = "grey60"
)

# Restore the margins and clipping that were in effect before this chunk.
par(oldpar)

## ----fig.width=5, fig.height=5, fig.align='center'----------------------------
# Scatterplot matrix of all columns currently in DM_ice_gb.
pairs(
  DM_ice_gb,
  cex = 0.5,
  cex.labels = 1,
  gap = 0
)

## -----------------------------------------------------------------------------
# Row totals of the TDM with its first row dropped, stored as a new
# `frequency` column.
item_totals <- rowSums(biber150_ice_gb[-1, ])
DM_ice_gb$frequency <- item_totals

## ----fig.width=3, fig.height=2.5, warning=FALSE, message=FALSE, fig.align='center', out.width="35%"----
# Scatterplot of DP against the natural log of frequency.
# par() returns the previous values of the settings it changes; they are kept
# so the graphics state can be restored once the plot is drawn.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

plot(
  DM_ice_gb$DP ~ log(DM_ice_gb$frequency),
  xlab = "Log frequency",
  ylab = "DP",
  ylim = c(0, 1)
)

# Restore the margins and clipping that were in effect before this chunk.
par(oldpar)


## -----------------------------------------------------------------------------
# Spearman rank correlation between DP and log frequency, computed on
# complete observations only.
with(
  DM_ice_gb,
  cor(DP, log(frequency), use = "complete.obs", method = "spearman")
)

## -----------------------------------------------------------------------------
# Same TDM as before, now with frequency adjustment switched on, using the
# "even" adjustment method; all printed output is suppressed.
DM_ice_gb_nofreq <- disp_tdm(
  tdm = biber150_ice_gb,
  row_partsize = "first",
  freq_adjust_method = "even",
  freq_adjust = TRUE,
  verbose = FALSE,
  print_score = FALSE
)

# Convert to a data frame so that columns can be accessed with `$`.
DM_ice_gb_nofreq <- data.frame(DM_ice_gb_nofreq)

# Row totals of the TDM with its first row dropped, stored as `frequency`.
item_totals_nofreq <- rowSums(biber150_ice_gb[-1, ])
DM_ice_gb_nofreq$frequency <- item_totals_nofreq

# Inspect the structure of the resulting data frame.
str(DM_ice_gb_nofreq)

## ----fig.width=3, fig.height=2.5, warning=FALSE, message=FALSE, fig.align='center', out.width="35%"----
# Scatterplot of DP_nofreq against the natural log of frequency.
# The previous settings are captured from the very call that changes them,
# so that both `mar` and `xpd` are restored afterwards. Previously only `mar`
# was saved, after first being forced to a hard-coded value, and `xpd = TRUE`
# stayed in effect.
oldpar <- par(mar = c(4, 4, 1, 0.3), xpd = TRUE)

plot(
  DM_ice_gb_nofreq$DP_nofreq ~ log(DM_ice_gb_nofreq$frequency),
  xlab = "Log frequency",
  ylab = "DP",
  ylim = c(0, 1)
)

# Restore the margins and clipping that were in effect before this chunk.
par(oldpar)

## -----------------------------------------------------------------------------
# Spearman rank correlation between DP_nofreq and log frequency, computed on
# complete observations only.
with(
  DM_ice_gb_nofreq,
  cor(DP_nofreq, log(frequency), use = "complete.obs", method = "spearman")
)