## ----setup, include = FALSE--------------------------------------------------- if (rlang::is_installed(c("modeldata"))) { run <- TRUE } else { run <- FALSE } knitr::opts_chunk$set( eval = run, collapse = TRUE, comment = "#>" ) ## ----message=FALSE------------------------------------------------------------ library(recipes) library(textrecipes) library(modeldata) data("tate_text") ## ----------------------------------------------------------------------------- words <- c("or", "and", "on") okc_rec <- recipe(~., data = tate_text) %>% step_tokenize(medium) %>% step_stopwords(medium, custom_stopword_source = words, keep = TRUE) %>% step_tf(medium) okc_obj <- okc_rec %>% prep() bake(okc_obj, tate_text) %>% select(starts_with("tf_medium")) ## ----------------------------------------------------------------------------- stopwords_list <- c( "was", "she's", "who", "had", "some", "same", "you", "most", "it's", "they", "for", "i'll", "which", "shan't", "we're", "such", "more", "with", "there's", "each" ) words <- c("sad", "happy") okc_rec <- recipe(~., data = tate_text) %>% step_tokenize(medium) %>% step_stopwords(medium, custom_stopword_source = stopwords_list) %>% step_stopwords(medium, custom_stopword_source = words) %>% step_tfidf(medium) okc_obj <- okc_rec %>% prep() bake(okc_obj, tate_text) %>% select(starts_with("tfidf_medium")) ## ----------------------------------------------------------------------------- okc_rec <- recipe(~., data = tate_text) %>% step_tokenize(medium, token = "characters") %>% step_stopwords(medium, custom_stopword_source = letters, keep = TRUE) %>% step_tf(medium) okc_obj <- okc_rec %>% prep() bake(okc_obj, tate_text) %>% select(starts_with("tf_medium")) ## ----------------------------------------------------------------------------- okc_rec <- recipe(~., data = tate_text) %>% step_tokenize(medium, token = "words") %>% step_stem(medium) %>% step_untokenize(medium) %>% step_tokenize(medium, token = "ngrams") %>% step_tokenfilter(medium, max_tokens = 500) %>% step_tfidf(medium) okc_obj <- okc_rec %>% prep() bake(okc_obj, tate_text) %>% select(starts_with("tfidf_medium"))