## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----charheavytail,fig.show='hold'-------------------------------------------- library(classInt) #1. Characterization of heavy-tail distributions---- set.seed(1234) #Pareto distribution a=1 b=1.161 n=1000 sample_par <- 1 / (1 - runif(1000)) ^ (1 / 1.161) opar <- par(no.readonly = TRUE) par(mar = c(2, 4, 3, 1), cex = 0.8) plot( sort(sample_par, decreasing = TRUE), type = "l", ylab = "F(x)", xlab = "", main = "80/20 principle" ) abline(h = quantile(sample_par, .8) , lty = 2, col = "red3") abline(v = 0.2*length(sample_par) , lty = 2, col = "darkblue") legend( "topleft", legend = c("F(x): p80", "x: Top 20%"), col = c("red3", "darkblue"), lty = 2, cex = 0.8 ) hist( sample_par, n = 100, xlab = "", main = "Histogram", col = "grey50", border = NA, probability = TRUE ) par(opar) ## ----stepbystep, fig.show='hold'---------------------------------------------- opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 3, 1), cex = 0.8) var <- sample_par thr <- .4 brks <- c(min(var), max(var)) #Initialise with min and max sum_table <- data.frame( iter = 0, mu = NA, prop = NA, n_var = NA, n_head = NA ) #Pars for chart limchart <- brks #Iteration for (i in 1:10) { mu <- mean(var) brks <- sort(c(brks, mu)) head <- var[var > mu] prop <- length(head) / length(var) stopit <- prop < thr & length(head) > 1 sum_table = rbind(sum_table, c(i, mu, prop, length(var), length(head))) hist( var, main = paste0("Iter ", i), breaks = 50, col = "grey50", border = NA, xlab = "", xlim = limchart ) abline(v = mu, col = "red3", lty = 2) ylabel <- max(hist(var, breaks = 50, plot = FALSE)$counts) labelplot <- paste0("PropHead: ", round(prop * 100, 2), "%") text( x = mu, y = ylabel, labels = labelplot, cex = 0.8, pos = 4 ) legend( "right", legend = paste0("mu", i), col = c("red3"), lty = 2, cex = 0.8 ) if (isFALSE(stopit)) break var <- head } par(opar) ## ----hiddtable, echo=FALSE---------------------------------------------------- sum_table$mu <- round(sum_table$mu,4) sum_table$prop <- paste0(round(100*sum_table$prop,2),"%") knitr::kable(sum_table[!is.na(sum_table$mu),], row.names = FALSE) ## ----checkmethod-------------------------------------------------------------- ht_sample_par <- classIntervals(sample_par, style = "headtails") brks == ht_sample_par$brks print(ht_sample_par) ## ----examplesimp, fig.show='hold', fig.asp=.7--------------------------------- opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) pal1 <- c("wheat1", "wheat2", "red3") # Minimum: single break print(classIntervals(sample_par, style = "headtails", thr = 0)) plot( classIntervals(sample_par, style = "headtails", thr = 0), pal = pal1, main = "thr = 0" ) # Two breaks print(classIntervals(sample_par, style = "headtails", thr = 0.2)) plot( classIntervals(sample_par, style = "headtails", thr = 0.2), pal = pal1, main = "thr = 0.2" ) # Default breaks: 0.4 print(classIntervals(sample_par, style = "headtails")) plot(classIntervals(sample_par, style = "headtails"), pal = pal1, main = "thr = Default") # Maximum breaks print(classIntervals(sample_par, style = "headtails", thr = 1)) plot( classIntervals(sample_par, style = "headtails", thr = 1), pal = pal1, main = "thr = 1" ) par(opar) ## ----loadspdata, message=FALSE------------------------------------------------ library(spData) data(afcon, package = "spData") ## ----summspdata, fig.show='hold'---------------------------------------------- # Top10 knitr::kable(head(afcon[order(afcon$totcon, decreasing = TRUE),c("name","totcon")],10)) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 3, 1), cex = 0.8) hist(afcon$totcon, n = 20, main = "Histogram", xlab = "totcon", col = "grey50", border = NA, ) plot( density(afcon$totcon), main = "Distribution", xlab = "totcon", ) par(opar) ## ----breaksample,fig.show='hold'---------------------------------------------- brks_ht <- classIntervals(afcon$totcon, style = "headtails") print(brks_ht) #Same number of classes for "fisher" nclass <- length(brks_ht$brks) - 1 brks_fisher <- classIntervals(afcon$totcon, style = "fisher", n = nclass) print(brks_fisher) brks_quantile <- classIntervals(afcon$totcon, style = "quantile", n = nclass) print(brks_quantile) pal1 <- c("wheat1", "wheat2", "red3") opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 1), cex = 0.8) plot(brks_ht, pal = pal1, main = "headtails") plot(brks_fisher, pal = pal1, main = "fisher") plot(brks_quantile, pal = pal1, main = "quantile") par(opar) ## ----benchmarkbreaks, fig.show='hold', fig.width=7---------------------------- #Helper function to rescale values help_reescale <- function(x, min = 1, max = 10) { r <- (x - min(x)) / (max(x) - min(x)) r <- r * (max - min) + min return(r) } afcon$ecdf_class <- help_reescale(afcon$totcon, min = 1 - 0.5, max = nclass - 0.5) afcon$ht_breaks <- cut(afcon$totcon, brks_ht$brks, labels = FALSE, include.lowest = TRUE) afcon$fisher_breaks <- cut(afcon$totcon, brks_fisher$brks, labels = FALSE, include.lowest = TRUE) afcon$quantile_break <- cut(afcon$totcon, brks_quantile$brks, labels = FALSE, include.lowest = TRUE) opar <- par(no.readonly = TRUE) par(mar = c(4, 4, 1, 1), cex = 0.8) plot( density(afcon$ecdf_class), ylim = c(0, 0.8), lwd = 2, main = "", xlab = "class" ) lines(density(afcon$ht_breaks), col = "darkblue", lty = 2) lines(density(afcon$fisher_breaks), col = "limegreen", lty = 2) lines(density(afcon$quantile_break), col = "red3", lty = 2) legend("topright", legend = c("Continuous", "headtails", "fisher", "quantile"), col = c("black", "darkblue", "limegreen", "red3"), lwd = c(2, 1, 1, 1), lty = c(1, 2, 2, 2), cex = 0.8 ) par(opar) ## ----finalplot , fig.show='hold', fig.asp=1.2--------------------------------- custompal <- c("#FE9F6D99", "#DE496899", "#8C298199", "#3B0F7099", "#00000499") afcon$cex_points <- help_reescale(afcon$totcon, min = 1, max = 5) opar <- par(no.readonly = TRUE) par(mar = c(1.5, 1.5, 2, 1.5), cex = 0.8) # Plot continuous plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$cex_points, pch = 20, col = "grey50", main = "Continuous", ) mcont <- (max(afcon$totcon) - min(afcon$totcon)) / 4 legcont <- 1:5 * mcont - (mcont - min(afcon$totcon)) legend("bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(legcont, 0) ), col = "grey50", pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$ht_breaks, pch = 20, col = custompal[afcon$ht_breaks], main = "headtails" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_ht$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$fisher_breaks, pch = 20, col = custompal[afcon$fisher_breaks], main = "fisher" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_fisher$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() plot( x = afcon$x, y = afcon$y, axes = FALSE, cex = afcon$quantile_break, pch = 20, col = custompal[afcon$quantile_break], main = "quantile" ) legend( "bottomleft", xjust = 1, bty = "n", legend = paste0(" ", round(brks_quantile$brks[2:6],0) ), col = custompal, pt.cex = seq(1, 5), pch = 20, title = "totcon" ) box() par(opar) ## ----distest, fig.show='hold'------------------------------------------------- #Init samples set.seed(2389) #Pareto distributions a=7 b=14 paretodist <- 7 / (1 - runif(5000000)) ^ (1 / 14) #Exponential dist expdist <- rexp(5000000) #Lognorm lognormdist <- rlnorm(5000000) #Weibull weibulldist <- rweibull(5000000, 1, scale = 5) #LogCauchy "super-heavy tail" logcauchdist <- exp(rcauchy(5000000, 2, 4)) #Remove Inf logcauchdist <- logcauchdist[logcauchdist < Inf] #Normal dist normdist <- rnorm(5000000) #Left-tailed distr leftnorm <- sample(rep(normdist[normdist < mean(normdist)], 3), size = 5000000) #Uniform distribution unifdist <- runif(5000000) ## ----testresults, fig.show='hold'--------------------------------------------- testresults <- data.frame( Title = NA, style = NA, nsample = NA, thresold = NA, nbreaks = NA, time_secs = NA ) benchmarkdist <- function(dist, style = "headtails", thr = 0.4, title = "", plot = FALSE) { init <- Sys.time() br <- classIntervals(dist, style = style, thr = thr) a <- Sys.time() - init test <- data.frame( Title = title, style = style, nsample = format(length(br$var), scientific = FALSE, big.mark = ","), thresold = thr, nbreaks = length(br$brks) - 1, time_secs = as.character(round(a,4)) ) testresults <- unique(rbind(testresults, test)) if (plot) { plot( density(br$var, from = quantile(dist,.0005), to = quantile(dist,.9995) ), col = "black", cex.main = .9, main = paste0( title, " ", style, ", thr =", thr, ", nbreaks = ", length(br$brks) - 1 ), ylab = "", xlab = "" ) abline(v = br$brks, col = "red3", lty = 2) } return(testresults) } opar <- par(no.readonly = TRUE) par(mar = c(2, 2, 2, 2), cex = 0.8) # Pareto---- testresults <- benchmarkdist(paretodist, title = "Pareto", plot = TRUE) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = 0) testresults <- benchmarkdist(paretodist, title = "Pareto", thr = .75, plot = TRUE) #Sample 2,000 obs set.seed(1234) Paretosamp <- sample(paretodist, 2000) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "fisher", plot = TRUE) testresults <- benchmarkdist(Paretosamp, title = "Pareto sample", style = "headtails", plot = TRUE) #Exponential---- testresults <- benchmarkdist(expdist, title = "Exponential", plot = TRUE) testresults <- benchmarkdist(expdist, title = "Exponential", thr = 0) testresults <- benchmarkdist(expdist, title = "Exponential", thr = 1) testresults <- benchmarkdist(expdist, title = "Exponential", style = "quantile", plot = TRUE) #Weibull----- testresults <- benchmarkdist(weibulldist, title = "Weibull", plot = TRUE) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 0) testresults <- benchmarkdist(weibulldist, title = "Weibull", thr = 1) #Logcauchy testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", plot = TRUE) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 0) testresults <- benchmarkdist(logcauchdist, title = "LogCauchy", thr = 1) #Normal---- testresults <- benchmarkdist(normdist, title = "Normal", plot = TRUE) testresults <- benchmarkdist(normdist, title = "Normal", thr = 0) testresults <- benchmarkdist(normdist, title = "Normal", thr = 1, plot = TRUE) #Truncated Left-tail Normal---- testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE) testresults <- benchmarkdist(leftnorm, title = "Left Normal", thr = -100) testresults <- benchmarkdist(leftnorm, title = "Left Normal", plot = TRUE, thr = 500) #Uniform---- testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 0.7) testresults <- benchmarkdist(unifdist, title = "Uniform", thr = 0) testresults <- benchmarkdist(unifdist, title = "Uniform", plot = TRUE, thr = 1) par(opar) # Results knitr::kable(testresults[-1, ], row.names = FALSE)