---
title: "Examples with wiki_utils"
author: "Angel Zazo, Department of Computer Science and Automatics, University of Salamanca"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{wikiTools package}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

## Functions

### Functions to obtain a list of Wikidata entities

w_SearchByLabel(string, mode='inlabel', langs="", langsorder='',
                instanceof="", Pproperty="", debug=FALSE)

w_SearchByOccupation(Qoc, mode=c('default','count','wikipedias'),
                     langsorder='', wikilangs='', nlimit=10000, debug=FALSE)

### Functions to obtain information from a list of Wikidata entities or a single one

w_isInstanceOf(entity_list, instanceof='', nlimit=50000, debug=FALSE)

w_Wikipedias(entity_list, wikilangs="", instanceof='', nlimit=1500, debug=FALSE)

w_isValid(entity_list, nlimit=50000, debug=FALSE)

w_Property(entity_list, Pproperty, includeQ=FALSE, langsorder='en',
           nlimit=10000, debug=FALSE)

w_SearchByAuthority(Pauthority, langsorder='', instanceof='',
                    nlimit=10000, debug=FALSE)

Pauthority = Authority Database Property in Wikidata

w_EntityInfo(entity_list, mode='default', langsorder='',
             wikilangs="",  nlimit=MW_LIMIT, debug=FALSE)

### Functions to obtain information using the Wikimedia APIs

m_Opensearch(string, project='en.wikipedia.org',
             profile="engine_autoselect", redirects="resolve")

m_reqMediaWiki(titles,
               mode=c('wikidataEntity','redirects','pagePrimaryImage','pageFiles'),
               project='en.wikipedia.org', redirects=TRUE, exclude_ext='svg|webp|xcf')

m_Pageviews(article, start, end, project="en.wikipedia.org",
            access="all-access", agent="user",
            granularity="monthly", redirects=FALSE)

m_XtoolsInfo(article, infotype=c("articleinfo", "prose", "links"),
             project="en.wikipedia.org", redirects=FALSE)

### Functions to obtain information (viafID or cluster records) using the VIAF API

v_AutoSuggest(author) : obtains viafID

v_Search(CQL_Query, mode=c('default', 'anyField', 'allmainHeadingEl',
                           'allNames', 'allPersonalNames', 'allTitle'),
         schema=c('brief', 'JSON')) : obtains clusters records

### Function to retrieve a cluster record using the viafID.

v_GetRecord(viafid, record_format='viaf.json'): retrieve a cluster record

### Function to extract information from a VIAF cluster record

v_Extract(viaf, info, source=NULL)



## Package installation and loading

To install and load the updated version of the wikiTools package simply run the following commands:
```{r eval=FALSE}
install.packages("wikiTools")
```
```{r echo=TRUE}
library(wikiTools)
```



## Examples of Wikidata functions using WDQS

#### Search string "Iranzo" in different positions
Exact search in Label or in AltLabel (case- and diacritic-sensitive)

Optional: limit by instanceof Wikidata class (Qxx).

Optional: return information of some properties (Pproperties, Pxxx).
```{r echo=TRUE}
# Searches for the string 'Iranzo' with langsorder='es|en'.
# NOTE(review): no `mode` argument is passed, so according to the signature
# listed in the "Functions" section these calls run with the default mode
# ('inlabel') -- confirm whether an exact-match mode was intended here.
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en')
# Same search, limited to instances of one class (Q5).
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5')
# Several classes can be given, separated by '|'.
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352')
# Additionally request some properties (Pxxx identifiers separated by '|').
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570')
```

#### Search at the beginning in Label or AltLabel (diacritics and case are ignored)
```{r echo=TRUE}
# Prefix ("startswith") searches for "Iranzo", searching in English.
# The language argument is written in full as `langs`, the name used in the
# function signature, instead of relying on R's partial matching of `lang`.
df <- w_SearchByLabel(string = "Iranzo", langs = "en", langsorder = "es|en",
                      mode = "startswith")
# Limited to instances of one class (Q5).
df <- w_SearchByLabel(string = "Iranzo", langs = "en", langsorder = "es|en",
                      instanceof = "Q5", mode = "startswith")
# Limited to instances of any of several classes.
df <- w_SearchByLabel(string = "Iranzo", langs = "en", langsorder = "es|en",
                      instanceof = "Q5|Q101352", mode = "startswith")
# Also requesting some properties of the entities found.
df <- w_SearchByLabel(string = "Iranzo", langs = "en", langsorder = "en",
                      instanceof = "Q5", Pproperty = "P21|P569|P570",
                      mode = "startswith")
```

#### Search in any position in Label or AltLabel (diacritics and case are ignored)
If `langs` is `''`, the search is performed in any language; otherwise it is performed only in the language indicated.
```{r echo=TRUE}
# "inlabel" search for "Iranzo", with no language restriction.
df <- w_SearchByLabel(
  string = "Iranzo",
  langsorder = "es|en",
  mode = "inlabel"
)
```
Search only in Chinese (Simplified) (language code: zh):
```{r echo=TRUE}
# `langs = "zh"` limits the search to Chinese. The argument name is written in
# full (as in the function signature) instead of the partially matched `lang`.
df <- w_SearchByLabel(string = "Iranzo", langsorder = "zh|es", langs = "zh",
                      mode = "inlabel")
```
Optional instanceof and Property
```{r echo=TRUE}
# "inlabel" search limited to instances of one class (Q5).
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en",
  instanceof = "Q5", mode = "inlabel"
)
# Limited to instances of any of several classes.
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en",
  instanceof = "Q5|Q101352", mode = "inlabel"
)
# Also requesting some properties of the entities found.
df <- w_SearchByLabel(
  string = "Iranzo", langsorder = "es|en",
  instanceof = "Q5", Pproperty = "P21|P569|P570", mode = "inlabel"
)
```

#### aux: getting a vector of entities (`l`) to use later.
```{r echo=TRUE}
# Run an "inlabel" search and keep its `entity` column in `l`;
# the following examples reuse this vector.
df <- w_SearchByLabel(string = "Iranzo", mode = "inlabel", langsorder = "es|en")
l <- df$entity
```

#### w_isInstanceOf
Check whether the elements of entity_list are instances of a Wikidata class
```{r echo=TRUE}
# Check the entities in `l` against class Q5.
df <- w_isInstanceOf(entity_list = l, instanceof = "Q5")
# Show the rows whose `instanceof_Q5` value is not TRUE
df[!df$instanceof_Q5, ]
```

#### w_Wikipedias
Search for Wikipedia pages in all/some languages

Optional: instanceOF (limit to entities which are instance of a Wikidata class)
```{r echo=TRUE}
# Wikipedia pages of the entities in `l`: first with the default `wikilangs`,
# then for selected languages, then also limited to instances of Q5.
df <- w_Wikipedias(entity_list = l)
df <- w_Wikipedias(entity_list = l, wikilangs = "es|en|fr")
df <- w_Wikipedias(entity_list = l, wikilangs = "es|en|fr", instanceof = "Q5")
```

#### w_SearchByOccupation
Count entities, or get the entities with that occupation, also get Wikipedia pages

Note: depending on the connection speed, the nlimit parameter may need to be adjusted
```{r echo=TRUE}
# "Q2306091" is the occupation entity (Qoc) for Sociologist.
# mode = "count" is the counting mode.
w_SearchByOccupation(Qoc = "Q2306091", mode = "count")
# Default mode; the `entity` column of the result is kept in `l`.
q <- w_SearchByOccupation(Qoc = "Q2306091")
l <- q$entity
```

```{r echo=TRUE, eval=FALSE}
lw <- w_SearchByOccupation(Qoc = "Q2306091", mode = "wikipedias")  # lw is a data frame
# The same information can be obtained with the previous function, w_Wikipedias():
lw2 <- w_Wikipedias(entity_list = l)
# Verify one entity: both results should hold the same `pages` value.
all(lw["Q10320558", "pages"] == lw2["Q10320558", "pages"])
# Verify another entity, ignoring the order of the "|"-separated pages.
# `fixed = TRUE` is spelled out because `T` is an ordinary, reassignable variable.
all(sort(strsplit(lw["Q9061", "pages"], "|", fixed = TRUE)[[1]]) ==
    sort(strsplit(lw2["Q9061", "pages"], "|", fixed = TRUE)[[1]]))
```

#### w_isValid.
Check whether the Wikidata entities are valid. An entity is valid if it has a label
or a description. If an entity exists but is not valid, it may be a redirection
to another entity; in that case, the redirection is obtained. Other entities may
have existed in the past but have since been deleted.
```{r echo=TRUE}
# Extend the entity vector with two more identifiers before validating.
l2 <- c(l, c("Q115637688", "Q105660123"))
v <- w_isValid(entity_list = l2)
# Entities reported as not valid
v[!v$valid, ]
```

#### w_Property
Obtain properties of entity_list.
```{r echo=TRUE}
# Request three properties for every entity in `l`.
p <- w_Property(
  entity_list = l,
  Pproperty = "P21|P569|P214",
  langsorder = "es|en"
)
```

#### w_SearchByAuthority
Search for Wikidata entities that have an identifier in the Wikidata authority property "Pauthority".

Optional: instanceOf

Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de Arte Reina Sofía)
```{r echo=TRUE}
# Every entity with a P4439 identifier (humans, groups, etc.);
# 1286 rows when this example was written.
mncars <- w_SearchByAuthority(Pauthority = "P4439", langsorder = "es|en")
# Same search limited to instances of Q5; 1280 rows when this was written.
mncarsQ5 <- w_SearchByAuthority(
  Pauthority = "P4439", langsorder = "es|en", instanceof = "Q5"
)
# Entities that are not 'human' (Q5); see the entityDescription column:
mncars[!(mncars$entity %in% mncarsQ5$entity), ]
```

#### w_EntityInfo
Get some properties of a Wikidata entity.
```{r echo=TRUE}
# A single entity; the second call also passes `wikilangs`.
df <- w_EntityInfo(entity_list = "Q134644", langsorder = "es|en")
df <- w_EntityInfo(
  entity_list = "Q134644", langsorder = "es|en", wikilangs = "es|en|fr"
)
# Several entities at once, using mode "film".
df <- w_EntityInfo(
  c("Q270510", "Q1675466", "Q24871"),
  mode = "film", langsorder = "es|en", wikilangs = "es|en|fr"
)
# Search the string "abba" with mode "inlabel", then get info on the results.
w <- w_SearchByLabel(
  "abba",
  mode = "inlabel", langsorder = "", instanceof = "Q5"
)
df <- w_EntityInfo(
  w$entity,
  langsorder = "en", wikilangs = "en|es|fr", debug = "info"
)
# Search 3D films, then get info on the results using mode "film".
w <- w_SearchByInstanceof(
  instanceof = "Q229390", langsorder = "en|es", debug = "info"
)
df <- w_EntityInfo(
  w$entity,
  mode = "film", langsorder = "en", wikilangs = "en", debug = "info"
)
```


## Examples of WikiMedia functions

#### m_Opensearch
Search for articles that contain any of the words (note: it is better to use a long string)

Some search profiles:
```{r echo=TRUE}
# The same query on the Spanish Wikipedia with three search profiles.
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org",
  profile = "engine_autoselect", redirects = "resolve"
)
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org", profile = "strict"
)
df <- m_Opensearch(
  string = "Duque de Alba", project = "es.wikipedia.org", profile = "fuzzy"
)
```

#### m_reqMediaWiki
Checks if titles are in a Wikimedia project and returns the Wikidata entity for them, if they have one.

Note that `URLdecode("a%CC%8C")` is the letter "a" followed by the combining caron (ǎ)
```{r echo=TRUE}
# Ask the English Wikipedia for the Wikidata entity of each title.
df <- m_reqMediaWiki(
  c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "wikidataEntity",
  project = "en.wikipedia.org"
)
```

Obtains the redirections of a page (the page itself can be a redirect to other page).

Returns a vector for each title; in each vector the first element is the destination and the rest are all the pages that redirect to it.
```{r echo=TRUE}
# Redirections for three titles on the Spanish Wikipedia; print the result.
a <- m_reqMediaWiki(
  c("Cervantes", "Planck", "Noexiste"),
  mode = "redirects",
  project = "es.wikipedia.org"
)
a
```

Gets the URL of the primary image of Wikimedia pages.

Gets the URLs of all files inserted in the pages (images, sounds, videos...), using '|' as separator and excluding the extensions given in the exclude_ext parameter.

Both modes automatically resolve redirects (the destination is the "normalized" column of the data frame returned).
```{r echo=TRUE}
# Primary image of each page.
i <- m_reqMediaWiki(
  c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "pagePrimaryImage"
)

# Files inserted in each page, excluding some file extensions.
f <- m_reqMediaWiki(
  c("Max Planck", URLdecode("a%CC%8C"), "Max", "Cervante", "humanist"),
  mode = "pageFiles",
  exclude_ext = "svg|webp|xcf"
)
```

#### m_Pageviews
Gets the number of visits a page has had in a date interval

Optional: redirects
```{r echo=TRUE}
# Monthly page views of "Cervantes" on the Spanish Wikipedia.
v <- m_Pageviews(
  article = "Cervantes", start = "20230101", end = "20230501",
  project = "es.wikipedia.org", granularity = "monthly"
)
# Same request with redirects = TRUE.
vv <- m_Pageviews(
  article = "Cervantes", start = "20230101", end = "20230501",
  project = "es.wikipedia.org", granularity = "monthly",
  redirects = TRUE
)
```

#### m_XtoolsInfo
Obtains information (as vector) about an article in the Wikimedia project.

Infotype: articleinfo, prose, links

Optional: redirects
```{r echo=TRUE}
# "articleinfo" for an article, without and with redirects = TRUE.
x <- m_XtoolsInfo(
  article = "Cervantes", infotype = "articleinfo",
  project = "es.wikipedia.org"
)
xx <- m_XtoolsInfo(
  article = "Cervantes", infotype = "articleinfo",
  project = "es.wikipedia.org", redirects = TRUE
)

# "links" for an article, without and with redirects = TRUE.
y <- m_XtoolsInfo(
  article = "Miguel de Cervantes", infotype = "links",
  project = "es.wikipedia.org"
)
yy <- m_XtoolsInfo(
  article = "Cervantes", infotype = "links",
  project = "es.wikipedia.org", redirects = TRUE
)
```

Gets all information (articleinfo, prose, links).
```{r echo=TRUE}
# infotype = "all", without and with redirects = TRUE.
z <- m_XtoolsInfo(
  article = "Miguel de Cervantes", infotype = "all",
  project = "es.wikipedia.org"
)
zz <- m_XtoolsInfo(
  article = "Cervantes", infotype = "all",
  project = "es.wikipedia.org", redirects = TRUE
)
```

## Examples using VIAF functions

#### v_AutoSuggest
Searches for authors. Sometimes the same author appears several times, under different names.

Returns a data frame.

Important: The API returns a maximum of 10 records.
```{r echo=TRUE}
# Three author look-ups; each result is printed.
v_AutoSuggest(author = "Iranzo")
v_AutoSuggest(author = "Esparza, María")
v_AutoSuggest(author = "Escobar, Modesto")
# For the last call, note that four rows are returned,
# but only two different viafids.
```

#### v_Search
Search using CQL_Query

```{r echo=TRUE}
# Auxiliary function that prints specific information from each record.
#
# r: a list of search results; each element is expected to hold its VIAF
#    record at `$record$recordData` (that is the only part read here).
#
# For every record it prints a numbered header followed by the sources,
# gender, dates, occupations, titles and Wikipedia pages returned by
# v_Extract(). It is called for its printed output only.
showVIAF <- function(r) {
  # seq_along() supplies the record number directly (and is safe for empty `r`).
  for (i in seq_along(r)) {
    # Get viaf record
    viaf <- r[[i]]$record$recordData
    cat(paste0("-----------\nRecord #", i, "\nSources:\n"))
    print(v_Extract(viaf, info = "sources"))
    cat("Gender: ")
    print(v_Extract(viaf, info = "gender"))
    cat("Dates: ")
    print(v_Extract(viaf, info = "dates"))
    cat("Occupations: ")
    print(v_Extract(viaf, info = "occupations"))
    cat("Titles: ")
    print(v_Extract(viaf, info = "titles"))
    cat("Wikipedias: ")
    print(v_Extract(viaf, info = "wikipedias"))
  }
}
```

Search in any field (cql.any)

Operator is "=": so search all terms and only those ones:
```{r echo=TRUE}
CQL_Query <- 'cql.any = "García Iranzo, Juan"'
# `r` holds complete VIAF records (sometimes called "cluster records": each is
# unified by combining records from many libraries around the world).
r <- v_Search(CQL_Query = CQL_Query)
showVIAF(r)
```

Shortcut
```{r echo=TRUE}
# Shortcut: pass the search terms together with mode = "anyField".
r <- v_Search(CQL_Query = "García Iranzo, Juan", mode = "anyField")
showVIAF(r)
```

Search in 1xx, 4xx, 5xx fields of MARC record (local.names)

Operator is "all": search all terms
```{r echo=TRUE}
# Query on local.names using the "all" operator.
CQL_Query <- 'local.names all "Figuerola"'
r <- v_Search(CQL_Query = CQL_Query)
```

Shortcut
```{r echo=TRUE}
# Shortcut with mode = "allNames"; print the lengths of both results.
r2 <- v_Search(CQL_Query = "Figuerola", mode = "allNames")
cat(length(r), length(r2))
```

The records found exceed the API's maximum per-request limit (250):
```{r echo=TRUE}
# A local.names query for the term "Bolero".
CQL_Query <- 'local.names all "Bolero"'
r <- v_Search(CQL_Query = CQL_Query)
```

Search in 100, 400, 500 fields of MARC record (local.personalNames)

Operator is "all": search all terms
```{r echo=TRUE}
# Query on local.personalNames using the "all" operator, then show the records.
CQL_Query <- 'local.personalNames all "Modesto Escobar"'
r <- v_Search(CQL_Query = CQL_Query)
showVIAF(r)
```

v_Search mode=allmainHeadingEl: 1xx fields of MARC record:
```{r echo=TRUE}
# Shortcut with mode = "allmainHeadingEl".
r <- v_Search(
  CQL_Query = "Escobar Mercado, Modesto",
  mode = "allmainHeadingEl"
)
```

Search in Titles
```{r echo=TRUE}
# Query on local.title using the "all" operator, then show the records.
CQL_Query <- 'local.title all "Los pronósticos electorales con encuestas"'
r <- v_Search(CQL_Query = CQL_Query)
showVIAF(r)
```