## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(FlexRL) ## ----------------------------------------------------------------------------- df2016 = read.csv("exA.csv", row.names = 1) df2020 = read.csv("exB.csv", row.names = 1) ## ----------------------------------------------------------------------------- head(df2016) ## ----------------------------------------------------------------------------- head(df2020) ## ----------------------------------------------------------------------------- PIVs_config = list( ANASCI = list(stable = TRUE), SESSO = list(stable = TRUE), STACIV = list(stable = TRUE), STUDIO = list(stable = TRUE), IREG = list(stable = TRUE) ) PIVs = names(PIVs_config) PIVs_stable = sapply(PIVs_config, function(x) x$stable) ## ----------------------------------------------------------------------------- for(i in 1:length(PIVs)){ intersect_support_piv = intersect( unique(df2016[,PIVs[i]]), unique(df2020[,PIVs[i]]) ) df2016 = df2016[df2016[,PIVs[i]] %in% c(NA,intersect_support_piv),] df2020 = df2020[df2020[,PIVs[i]] %in% c(NA,intersect_support_piv),] } rownames(df2016) = 1:nrow(df2016) rownames(df2020) = 1:nrow(df2020) ## ----------------------------------------------------------------------------- links = intersect(df2016$ID, df2020$ID) Nlinks = length(links) TrueDelta = data.frame( matrix(0, nrow=0, ncol=2) ) for (i in 1:Nlinks) { id = links[i] id16 = which(df2016$ID == id) id20 = which(df2020$ID == id) TrueDelta = rbind(TrueDelta, cbind(rownames(df2016[id16,]),rownames(df2020[id20,]))) } true_pairs = do.call(paste, c(TrueDelta, list(sep="_"))) ## ----------------------------------------------------------------------------- nrow(df2016) ## ----------------------------------------------------------------------------- nrow(df2020) ## ----------------------------------------------------------------------------- Nlinks ## ----------------------------------------------------------------------------- colSums(df2016[TrueDelta[,1],PIVs] == df2020[TrueDelta[,2],PIVs]) / Nlinks ## ----------------------------------------------------------------------------- colSums(is.na(df2016[TrueDelta[,1],PIVs])) / Nlinks ## ----------------------------------------------------------------------------- colSums(is.na(df2020[TrueDelta[,2],PIVs])) / Nlinks ## ----------------------------------------------------------------------------- df2016$source = "df2016" df2020$source = "df2020" if(nrow(df2020)>nrow(df2016)){ encodedA = df2016 encodedB = df2020 cat("df2020 is the largest file, denoted encodedB") }else{ encodedB = df2016 encodedA = df2020 cat("df2016 is the largest file, denoted encodedB") } levels_PIVs = lapply(PIVs, function(x) levels(factor(as.character(c(encodedA[,x], encodedB[,x]))))) for(i in 1:length(PIVs)) { encodedA[,PIVs[i]] = as.numeric(factor(as.character(encodedA[,PIVs[i]]), levels=levels_PIVs[[i]])) encodedB[,PIVs[i]] = as.numeric(factor(as.character(encodedB[,PIVs[i]]), levels=levels_PIVs[[i]])) } nvalues = sapply(levels_PIVs, length) names(nvalues) = PIVs encodedA[,PIVs][ is.na(encodedA[,PIVs]) ] = 0 encodedB[,PIVs][ is.na(encodedB[,PIVs]) ] = 0 ## ----------------------------------------------------------------------------- nvalues ## ----------------------------------------------------------------------------- data = list( A = encodedA, B = encodedB, Nvalues = nvalues, PIVs_config = PIVs_config, controlOnMistakes = c(TRUE, TRUE, FALSE, FALSE, FALSE), sameMistakes = TRUE, phiMistakesAFixed = FALSE, phiMistakesBFixed = FALSE, phiForMistakesA = c(NA, NA, NA, NA, NA), phiForMistakesB = c(NA, NA, NA, NA, NA) ) fit = FlexRL::stEM( data = data, StEMIter = 50, StEMBurnin = 30, GibbsIter = 100, GibbsBurnin = 70, musicOn = TRUE, newDirectory = NULL, saveInfoIter = FALSE ) ## ----------------------------------------------------------------------------- DeltaResult = fit$Delta colnames(DeltaResult) = c("idx20","idx16","probaLink") DeltaResult = DeltaResult[DeltaResult$probaLink>0.5,] DeltaResult ## ----------------------------------------------------------------------------- results = data.frame( Results=matrix(NA, nrow=6, ncol=1) ) rownames(results) = c("tp","fp","fn","f1score","fdr","sens.") if(nrow(DeltaResult)>1){ linked_pairs = do.call(paste, c(DeltaResult[,c("idx16","idx20")], list(sep="_"))) truepositive = length( intersect(linked_pairs, true_pairs) ) falsepositive = length( setdiff(linked_pairs, true_pairs) ) falsenegative = length( setdiff(true_pairs, linked_pairs) ) precision = truepositive / (truepositive + falsepositive) fdr = 1 - precision sensitivity = truepositive / (truepositive + falsenegative) f1score = 2 * (precision * sensitivity) / (precision + sensitivity) results[,"FlexRL"] = c(truepositive,falsepositive,falsenegative,f1score,fdr,sensitivity) } results ## ----------------------------------------------------------------------------- colSums(encodedA[DeltaResult[,"idx20"],PIVs] == encodedB[DeltaResult[,"idx16"],PIVs]) / nrow(DeltaResult)