## ----include = FALSE----------------------------------------------------------
# Chunk defaults for this document: `collapse` and the "#>" output prefix
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# Switch off the rmarkdown html_vignette title check
options(rmarkdown.html_vignette.check_title = FALSE)
library(tidyverse)

## ----checkIDs-----------------------------------------------------------------
library(BGmisc)

# Derive the example dataset from `potter` with ped2fam()
df <- ped2fam(
  potter,
  famID = "newFamID",
  personID = "personID"
)

# Run the ID checks with repair switched off
checkIDs(
  df,
  repair = FALSE
)

## ----datamade-----------------------------------------------------------------
# Start from `df` and inject two ID problems into a copy
df_duplicates <- df

# Problem 1: two different people share one ID
# (Vernon Dursley is given Marjorie Dursley's personID)
df_duplicates[["personID"]][df_duplicates[["name"]] == "Vernon Dursley"] <-
  df_duplicates[["personID"]][df_duplicates[["name"]] == "Marjorie Dursley"]

# Problem 2: one person appears twice
# (Dudley Dursley's row is appended a second time)
df_duplicates <- rbind(
  df_duplicates,
  df_duplicates[df_duplicates[["name"]] == "Dudley Dursley", ]
)

## -----------------------------------------------------------------------------
library(tidyverse)

# Summarise the problematic data and preview its `family_summary` element
glimpse(
  summarizeFamilies(
    df_duplicates,
    famID = "newFamID",
    personID = "personID"
  )$family_summary
)

## -----------------------------------------------------------------------------
# Run the ID checks on the problematic data and show the report
result <- checkIDs(df_duplicates)
print(result)

## -----------------------------------------------------------------------------
# Pull the rows whose personID was flagged as non-unique, sorted by ID so
# that conflicting entries sit next to each other
filter(df_duplicates, personID %in% result$non_unique_ids) %>%
  arrange(personID)

## -----------------------------------------------------------------------------
# Repair the problematic dataset. The IDs in `result` were flagged in
# `df_duplicates`, so that is the data that must be repaired here; repairing
# the unmodified `df` would leave the injected problems untouched.
df_repair <- checkIDs(df_duplicates, repair = TRUE)

# Look up the previously flagged IDs in the repaired data
# NOTE(review): assumes the repaired output names the person ID column `ID`
# rather than `personID` -- confirm against checkIDs()
df_repair %>%
  filter(ID %in% result$non_unique_ids) %>%
  arrange(ID)

# Re-run the checks on the repaired data to see which issues remain
result <- checkIDs(df_repair)

print(result)

## ----within-------------------------------------------------------------------
# Build a dataset in which one person is listed as their own mother

df_within <- ped2fam(
  potter,
  famID = "newFamID",
  personID = "personID"
)

# Point Vernon Dursley's momID at his own personID
df_within$momID[df_within$name == "Vernon Dursley"] <-
  df_within$personID[df_within$name == "Vernon Dursley"]

# Run the ID checks with repair switched off and show the report
result <- checkIDs(
  df_within,
  repair = FALSE
)
print(result)

## -----------------------------------------------------------------------------
# Show the rows whose momID is one of the flagged own-mother IDs

df_within[
  df_within$momID %in% result$is_own_mother_ids,
]

## -----------------------------------------------------------------------------
# Check the sex coding of `potter` (male = 1, female = 0) with repair
# switched off, then show what the check returned

results <- checkSex(
  potter,
  code_male = 1,
  code_female = 0,
  verbose = TRUE,
  repair = FALSE
)
print(results)

## -----------------------------------------------------------------------------
# Same check as before, but with repair switched on; show the result
df_fix <- checkSex(
  potter,
  code_male = 1,
  code_female = 0,
  verbose = TRUE,
  repair = TRUE
)
print(df_fix)

## ----eval = FALSE-------------------------------------------------------------
# # NOTE: this example is currently broken, so the chunk is not evaluated (eval = FALSE)
# # Load necessary libraries and datasets
# library(tidyverse)
# library(BGmisc)
# set.seed(123)
# # Create a sample dataset similar to the one used in Mason's approach
# sample_data <- data.frame(
#   ID = 1:10,
#   name = c("Person1", "Person2", "Person3", "Person4", "Person5", "Person6", "Person7", "Person8", "Person9", "Person10"),
#   dadID = c(NA, NA, 1, 1, 3, 3, 5, 5, 7, 7),
#   momID = c(NA, NA, 2, 2, 4, 4, 6, 6, 7, 8),
#   sex = c(1, 0, 1, 0, 1, 0, 1, 0, 1, 0),
#   byr = runif(10, 1900, 2000),
#   dyr = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)
# )
# 
# 
# 
# summarizePedigrees(sample_data)
# 
# 
# # Clean the sample dataset
# cleaned_data <- sample_data %>%
#   janitor::remove_empty(c("rows", "cols")) %>%
#   mutate(
#     sex_factor = as.factor(case_when(sex == 1 ~ "male", sex == 0 ~ "female"))
#   )
# 
# # Check for duplicate IDs
# temp_check <- checkIDs(cleaned_data, verbose = TRUE, repair = FALSE)
# all_duplicated_ids <- cbind(temp_check$non_unique_ids, temp_check$duplicated_parents_ids)
# 
# cleaned_data <- cleaned_data %>%
#   mutate(
#     duplicated = case_when(ID %in% temp_check$non_unique_ids ~ 1, TRUE ~ 0),
#     duplicated_parent = case_when(dadID %in% all_duplicated_ids | momID %in% all_duplicated_ids ~ 1, TRUE ~ 0),
#     duplicated_source_ID = case_when(ID %in% all_duplicated_ids ~ ID, dadID %in% all_duplicated_ids ~ dadID, momID %in% all_duplicated_ids ~ momID, TRUE ~ NA_integer_),
#     alteredlinks = 0
#   )
# 
# # Display and manually correct specific errors
# cleaned_data %>%
#   filter(duplicated == 1 | duplicated_parent == 1) %>%
#   arrange(duplicated_source_ID, ID) %>%
#   print(n = Inf)
# 
# # Perform specific corrections
# cleaned_data <- cleaned_data %>%
#   mutate(
#     alteredlinks = case_when(ID == 9 ~ 1, TRUE ~ alteredlinks),
#     ID = case_when(ID == 7 & round(byr, digits = 0) == 2020 ~ ID + 1e6, TRUE ~ ID)
#   )
# 
# # Final check for remaining duplicates
# final_check <- checkIDs(cleaned_data, verbose = TRUE, repair = FALSE)
# print(final_check)