## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults for the whole document: source and output are collapsed into
# a single block and each output line is prefixed with "#>"
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# The package is attached here, in the hidden setup chunk, so that the ugly
# "was compiled with R.x.x" warning does not end up in the rendered document
library(stringmagic)

# Allow the caching of the input of string_get() in non-interactive mode
options(string_magic_string_get_forced_caching = TRUE)

## -----------------------------------------------------------------------------
cars <- rownames(mtcars)
cat_magic("All cars from mtcars:\n{C, 60 swidth ? cars}")

# Logical AND ('&') between patterns: car names containing an 'a', an 'e',
# an 'i' and an 'o', all in lower case
string_get(cars, "a & e & i & o")

# Negation ('!'): car names with no 'e' and with at least one digit
string_get(cars, "!e & \\d")

# Flags are written before the '/' and apply to all the patterns:
# car names containing the 'words' 2, 9 or l
# (the flags can also be written in short form: "wi/2 | 9 | l")
string_get(cars, "word, ignore/2 | 9 | l")

## -----------------------------------------------------------------------------
# Counterpart of: string_get(cars, "a & e & i & o")
# => cars with an 'a', an 'e', an 'i', and an 'o', all in lower case
string_get(cars, "a", "e", "i", "o")

# Counterpart of: string_get(cars, "!e & \\d")
# => cars with no 'e' and at least one digit
string_get(cars, "!e", "\\d")

# Counterpart of: string_get(cars, "!/e & \\d")
# => this one cannot be replicated directly: we apply a logical equivalence
#    and combine the negated patterns with `or = TRUE`
string_get(cars, "!e", "!\\d", or = TRUE)

# Counterpart of: string_get(cars, "wi/2 | 9 | l")
# => cars containing the 'words' 2, 9 or l; the flags become arguments
string_get(
  cars, "2", "9", "l",
  word = TRUE, ignore.case = TRUE, or = TRUE
)

## -----------------------------------------------------------------------------
# With `seq = TRUE` the results come pattern after pattern: first the cars
# without digits, then the cars with 2 'a's or 2 'e's and a digit
string_get(cars, "!\\d", "i/a.+a | e.+e & \\d", seq = TRUE)

# First word of each car name
car_first <- string_ops(cars, "extract.first")

# Car brands ending with 'a', followed by the ones ending with 'i'
string_get(car_first, "a$", "i$", seq = TRUE)

# `seq.unik` works like `seq` and additionally applies unique()
string_get(car_first, "a$", "i$", seq.unik = TRUE)

## -----------------------------------------------------------------------------
# Since we used `car_first` in the previous example, we don't need to provide
# it explicitly now
# => brands containing 'M' and ending with 'a' or 'i'; brands containing 'M'
# NOTE: no data vector is passed below; every argument is a pattern. The call
# relies on the input remembered from the last string_get() call (presumably
# what the forced-caching option set in the setup chunk enables when the
# script is not run interactively).
string_get("M & [ai]$", "M", seq.unik = TRUE)

## -----------------------------------------------------------------------------
# Parsing a raw input, part 1: extracting the numbers
input <- "8.5in, 5.5, .5 cm"
string_ops(input, "','split, tws, '^\\. => 0.'replace, '^\\D+|\\D+$'replace, num")


# How to read the chain of operations:
#   ','split             -> splits the string w.r.t. ','
#   tws                  -> trims the whitespaces
#   '^\\. => 0.'replace  -> adds a 0 to the strings starting with '.'
#   '^\\D+|\\D+$'replace -> removes the non-digits on both ends of the string
#   num                  -> converts to numeric


# Parsing a raw input, part 2: extracting the units
string_ops(input, "','split, '^[ \\d.]+'replace, tws")


# How to read the chain of operations:
#   ','split             -> splits the string w.r.t. ','
#   '^[ \\d.]+'replace   -> removes the ' ', the digits and the '.' found
#                           at the beginning of the string
#   tws                  -> trims the whitespaces

## -----------------------------------------------------------------------------
# Same tool, this time applied to the car data
cars <- rownames(mtcars)

# Brands starting with an "m"
string_ops(cars, "'i/^m'get, x, unik")


# How to read the chain of operations:
#   'i/^m'get -> keeps only the elements starting with an m:
#                i/ is the 'regex-flag' "ignore", used to ignore the case,
#                ^m means "starts with an m" in regex language
#   x         -> extracts the first pattern; the default pattern is
#                "[[:alnum:]]+", which means an alpha-numeric word
#   unik      -> applies unique() to the vector


# The 3 largest numbers appearing in the car models
string_ops(cars, "'\\d+'x, rm, unik, num, dsort, 3 first")


# How to read the chain of operations:
#   '\\d+'x -> extracts the first pattern, the pattern meaning
#              "a succession of digits" in regex language
#   rm      -> removes the elements equal to the empty string (default behavior)
#   unik    -> applies unique() to the vector
#   num     -> converts to numeric
#   dsort   -> sorts in decreasing order
#   3 first -> keeps only the first three elements

## -----------------------------------------------------------------------------
monologue <- c(
  "For who would bear the whips and scorns of time",
  "Th' oppressor's wrong, the proud man's contumely,",
  "The pangs of despis'd love, the law's delay,",
  "The insolence of office, and the spurns",
  "That patient merit of th' unworthy takes,",
  "When he himself might his quietus make",
  "With a bare bodkin? Who would these fardels bear,",
  "To grunt and sweat under a weary life,",
  "But that the dread of something after death-",
  "The undiscover'd country, from whose bourn",
  "No traveller returns- puzzles the will,",
  "And makes us rather bear those ills we have",
  "Than fly to others that we know not of?"
)

# Cleaning a text: the cleaning steps are given as successive arguments
string_clean(
  monologue,
  # string_magic operations: lower the case and remove the basic stopwords
  "@lower, stopword",
  # remove a few extra stopwords (we use the flag word 'w/')
  "w/th, 's",
  # manual stemming of some verbs
  "despis'd => despise",
  "undiscover'd => undiscover",
  "(m|t)akes => \\1ake",
  # more stemming: drop the ending 's' of words of 4+ letters, quietus excepted
  "(\\w{3,}[^u])s\\b => \\1",
  # normalize the whitespaces and remove the punctuation
  "@ws.punct"
)


## -----------------------------------------------------------------------------
# A vector of fruit names written as a single comma-separated string
fruits <- string_vec("orange, apple, pineapple, strawberry")
fruits

## -----------------------------------------------------------------------------
# The existing vector `fruits` is inserted between the new elements via {fruits}
more_fruits <- string_vec("lemon, {fruits}, peach")
more_fruits

## -----------------------------------------------------------------------------
# Same insertion, but the operation `6 Shorten` is applied to `fruits` first
more_fruits <- string_vec("lemon, {6 Shorten ? fruits}, peach")
more_fruits

## -----------------------------------------------------------------------------
# Package names first, then the import statements built from them with {pkgs}
pkgs <- string_vec("pandas, os, time, re")
imports <- string_vec("import numpy as np, import {pkgs}")
imports

## -----------------------------------------------------------------------------
# Matrix written as text, using the argument `.nmat`.
# NOTE(review): with `.nmat = TRUE` each line of the string presumably becomes
# one row of a numeric matrix -- confirm in ?string_vec. The line breaks inside
# the string are part of the data: do not reflow this literal.
string_vec("1, 5,
            3, 2,
            5, 12", .nmat = TRUE)

## -----------------------------------------------------------------------------
# Same values, now passed as separate arguments, with `.nmat` set to an
# integer instead of TRUE.
# NOTE(review): the integer is presumably the number of rows -- see ?string_vec
string_vec(
  1, 5,
  3, 2,
  5, 12,
  .nmat = 3
)

## -----------------------------------------------------------------------------
# The column names can be given directly in the argument `.df`
df <- string_vec("1, john,
                 3, marie,
                 5, harry", .df = "id, name")
df

# The numeric values are converted automatically: `id` can be used in arithmetic
df[["id"]] * 5

## -----------------------------------------------------------------------------
x <- c(
  "Nor rain, wind, thunder, fire are my daughters.",
  "When my information changes, I alter my conclusions."
)

# Splitting at each word (the separator is any run of punctuation or spaces)
sentences_split <- string_split2df(x, "[[:punct:] ]+")
sentences_split

# Pasting the words back together by observation: we recover the original
# vectors, only the punctuation is lost
paste_conditional(sentences_split$x, sentences_split$obs)

## -----------------------------------------------------------------------------
id <- c("ws", "jmk")

# Same split as before, with the identifier added under the name `author`
base_words <- string_split2df(
  x, "[[:punct:] ]+",
  id = list(author = id)
)

# Merging the words back, this time with a formula
paste_conditional(x ~ author, base_words)