# IPDLN workshop: # Learning to create synthetic data # Edinburgh, 6th September 2022 ################# # PRACTICAL 1 # ################# # Data: # SD2011 # Quality of Life in Poland 2011 # Included in synthpop package (see help for SD2011) ##### Set up ###### # Remove all objects from the workspace rm(list = ls()) # Set working directory setwd("D:/IPDLNworkshop") ## replace with your own path # Load synthpop library(synthpop) # Examine and check data - especially missing values # and number of distinct values for categorical variables help(SD2011) ## or ?SD2011 codebook.syn(SD2011)$tab ## overall table codebook.syn(SD2011)$labs$placesize ## to see labels for this variable summary(SD2011) ##### Select a set of 7 variables ##### mydata <- SD2011[, c("sex", "age", "alcabuse", "smoke", "nociga", "height", "weight", "bmi")] codebook.syn(mydata)$tab summary(mydata) ##### Try default synthesis no worries and check results ##### syn_default <- syn(mydata, seed = 7676) #!! NOTE that in the next compare() command plot = FALSE to avoid waiting for #!! plots and press return. Set to TRUE (default) to see plots. compare(syn_default, mydata, plot = FALSE) ## looks OK utility.tables(syn_default, mydata, tables = "twoway") ## alcabuse and sex not good ##### Try stratifying by sex ##### syn_strat_default <- syn.strata(mydata, strata = "sex", seed = 5634) compare(syn_strat_default, mydata, plot = FALSE) ## looks OK utility.tables(syn_strat_default, mydata, tables = "twoway", ntabstoprint = 1) ## alcabuse and sex better ## alcabuse and age worst but not too bad ##### Now look at some details of bmi ##### # It should be calculated from height and weight # Check it in the real and synthetic data mydata$calcbmi <- mydata$weight/mydata$height^2*10000 head(mydata) mysyndata <- syn_strat_default$syn mysyndata$calcbmi <- mysyndata$weight/mysyndata$height^2 par(mfrow = 1:2) with(mydata, plot(bmi, calcbmi, col ="dark red", pch = 20, cex = 2)) with(mysyndata, plot(bmi, calcbmi, col ="dark red", pch = 20, cex = 2)) ## oops not so good par(mfrow = c(1, 1)) # something funny happening mydata <- mydata[, -9] ## drop calculated variable calcbmi # Turns out on looking at the data that there are some values of bmi where # height and/or weight are missing these look like data errors of # some sort and bmi ought to have been set to missing for them mydata$bmi[is.na(mydata$height) | is.na(mydata$weight)] mydata$bmi[is.na(mydata$height) | is.na(mydata$weight)] <- NA ## this gets rid of one huge one ##### Passive synthesis for bmi ##### mymethod <- syn_default$method mymethod mymethod["bmi"] <- "~I(weight/height^2*10000)" # Calculate bmi passively mymethod syn_strat_v2 <- syn.strata(mydata, strata = "sex", method = mymethod) summary(syn_strat_v2$syn) compare(syn_strat_v2, mydata) ## look OK utility.tables(syn_strat_v2, mydata, tables = "twoway", ntabstoprint = 1) ## alcabuse and sex better ## alcabuse and age worst but not too bad # you might want to run the code above that produced bmi plots for this new synthesis ##### Now check -8 values in nociga ##### # In case you want to define a cont.na for this variable with(mydata, table(nociga, smoke)[1:5,]) ## some -8s are smokers with(syn_strat_v2$syn, table(nociga, smoke)[1:5,]) ## similar in synthetic # Did not really need cont.na for this variable ##### Parametric ##### syn_para <- syn(mydata, method = "parametric") parmeth <- syn_para$method ## to allow changing parmeth parmeth["bmi"] <- "~I(weight/height^2*10000)" # Also -8 will be needed for nociga in a parametric synthesis syn_para_cont.na <- syn(mydata, method = parmeth, cont.na = list(nociga = -8)) compare(syn_para_cont.na, mydata, plot = FALSE) ## better utility.tables(syn_para_cont.na, mydata, tables = "twoway", ntabstoprint = 1) ## alcabuse and sex better ## alcabuse and age worst but not too bad ## Not too bad but cart was better so use that one for sdc() ##### Save data for cart models ##### syn_strat_v2_sdc <- sdc(syn_strat_v2, mydata, label = "false_data", rm.replicated.uniques = TRUE, recode.vars = c("age"), bottom.top.coding = c(NA, 80), recode.exclude = c(NA, -8)) mysyndata <- syn_strat_v2_sdc$syn ## Ready to save and send off table(mysyndata$age) head(mysyndata)