############ set up ################################################# # move to directory where you have saved the data file # setwd("D://sls/training/slsjune22/webpage") rm(list = ls()) ## clean out workspace load("shs.Rdata") library(synthpop) # # examine and check data - especially missing values # summary(shs) codebook.syn(shs)$tab ## overall table codebook.syn(shs)$labs$shs_6cla ## to see labels for this variable ########################################################################## ## note I have put plot = FALSE in compare so as not to have to wait for ## plots and press return. Remove this to see plots ###################### two different cart models ############################################################################### ### system.time(syn_cart <- syn(shs , seed = 6876), ) ## did not finish in 24 hours system.time(syn_cart <- syn(shs, method = "cart", visit.sequence = c(1,3:10,2), seed = 6876)) ## under 1 minute seconds compare(syn_cart,shs, plot = FALSE) utility.tables(syn_cart, shs, ntabstoprint =1) system.time(syn_ctree <- syn(shs,method = "ctree", visit.sequence = c(1,3:10,2), seed = 5680)) ## under 1 minute seconds compare(syn_cart,shs, plot = FALSE) utility.tables(syn_ctree, shs, ntabstoprint =1) ## ## how do these compare ## ## now look at models system.time(syn_ctree <- syn(shs,method = "ctree", visit.sequence = c(1,3:10,2) , seed = 5684, models = TRUE)) ## under 30 seconds plot(syn_ctree$models$groupinc) plot(syn_ctree$models$emp_sta) ## ## intuse variables OK? ## with(shs, table(hours_int, int_other, useNA = "ifany")) with(shs, table(hours_int, intuse, useNA = "ifany")) with(syn_ctree$syn, table(hours_int, int_other, useNA = "ifany")) with(syn_cart$syn, table(hours_int, int_other, useNA = "ifany")) ## ## change parameters increasing minbucket gives smaller models ## system.time(syn_cart2 <- syn(shs, method = "cart", visit.sequence = c(1,3:10,2), cart.minbucket = 10000, seed = 6876)) ## 4 seconds compare(syn_cart2,shs, plot = FALSE) ## not too bad utility.tables(syn_cart2, shs, ntabstoprint =1) ## worse # # rules no longer respected with(syn_cart2$syn, table(hours_int, intuse, useNA = "ifany")) ## ############################ two methods of fixing rules ########################################## ## 1 stratify ############################################# # system.time(syn_cart2.strata <- syn.strata(shs, method = "cart", strata = "intuse" , visit.sequence = c(1,3:10,2), cart.minbucket = 10000, seed = 6876)) ## 4 seconds compare(syn_cart2.strata,shs, plot = FALSE) ## OK utility.tables(syn_cart2.strata, shs) ## still not too good but better # # but at least the rules are OK with(syn_cart2.strata$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_cart2.strata$syn, table(hours_int, int_grocery, useNA = "ifany")) with(syn_cart2.strata$syn, table(hours_int, int_other, useNA = "ifany")) # ## or 2 by defining rules######################################### # intrules <- list(hours_int = "intuse == 'no'", int_other = "intuse == 'no'",int_grocery = "intuse == 'no'") intvalues <- list(hours_int = "none", int_other = "no internet", int_grocery = "no internet") system.time(syn_cart2.rules <- syn(shs, method = "cart", rules = intrules, rvalues = intvalues , visit.sequence = c(1,3:10,2), cart.minbucket = 10000, seed = 6876)) ## 3 seconds with(syn_cart2.rules$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_cart2.rules$syn, table(int_grocery, intuse, useNA = "ifany")) with(syn_cart2.rules$syn, table(int_other, intuse, useNA = "ifany")) # ############################## parametric models #######################s#################### ##system.time(syn_para <- syn(shs,method = "parametric", visit.sequence = c(1,3:10,2))) ## 1.65 hours # too slow so commented out # ## drop first two variables to speed up # shs_v2 <- shs[,-(1:2)] system.time(syn_para <- syn(shs_v2,method = "parametric", models = TRUE)) ## just over 1 min names(syn_para$models) syn_para$models[[8]] ## look at a model compare(syn_para,shs, plot = FALSE) ## not too bad utility.tables(syn_para,shs) ## relationships with age bad # # rules are almost OK except for 1 record could tidy using methods above # with(syn_para$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_para$syn, table(hours_int, int_other, useNA = "ifany")) with(syn_para$syn, table(hours_int, int_grocery, useNA = "ifany")) ############################## catall ########################################### # system.time(syn_catall <- syn(shs ,method = "catall")) ## failed try again system.time(syn_catall <- syn(shs ,method = "catall", numtocat = "age", catall.priorn = 100)) ## 13seconds) compare(syn_catall, shs, plot = FALSE) ## OK utility.tables(syn_catall,shs, ntabstoprint = 1) ## OK too # # with(syn_catall$syn, table(hours_int, intuse, useNA = "ifany")) ### but rules broken a bit with(syn_catall$syn, table(int_grocery, intuse, useNA = "ifany")) with(syn_catall$syn, table(int_other, intuse, useNA = "ifany")) # # rules broken though numbers quite small # new rerun with structural zeros # sz <- list( intuse_hours_int = list( intuse = "no", hours_int = 1:5), intuse_int_grocery = list( intuse = "no", int_grocery = 1:2), intuse_int_other = list( intuse = "no", int_other = 1:2), hours_int_intuse = list( hours_int = "none", intuse = 2), int_grocery_intuse = list( int_grocery = "no internet", intuse = 2), int_other_intuse = list( int_other = "no internet", intuse = 2) ) system.time(syn_catall.sz <- syn(shs ,method = "catall", catall.structzero = sz, numtocat = "age", catall.priorn = 100)) ## 13seconds) with(syn_catall.sz$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_catall.sz$syn, table(int_grocery, intuse, useNA = "ifany")) with(syn_catall.sz$syn, table(int_other, intuse, useNA = "ifany")) # # now rules OK # ########################### DP ########################################## # # needs to be grouped before synthesis or final data will not be DP # shs_cat <- numtocat.syn(shs)$data summary(shs_cat) system.time(syn_catalle2 <- syn(shs_cat , method = "catall", catall.epsilon =2 )) compare(syn_catalle2, shs_cat, plot = FALSE) utility.tables(syn_catalle2,shs_cat) ## terrible check scale # # check rules # with(syn_catalle2$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_catalle2$syn, table(int_grocery, intuse, useNA = "ifany")) with(syn_catalle2$syn, table(int_other, intuse, useNA = "ifany")) # # all bad try setting structural zeros # ######################### with structural zeros ################################ system.time(syn_catalle2.sz <- syn(shs_cat , method = "catall",catall.structzero = sz, catall.epsilon = 2 )) compare(syn_catalle2.sz,shs_cat, plot = FALSE)### still very bad utility.tables(syn_catalle2.sz,shs_cat, ntabstoprint = 1) ##### still very bad # with(syn_catalle2.sz$syn, table(hours_int, intuse, useNA = "ifany")) with(syn_catalle2.sz$syn, table(int_grocery, intuse, useNA = "ifany")) with(syn_catalle2.sz$syn, table(int_other, intuse, useNA = "ifany")) # # but now at least rules are respected you could try different epsilon # ##################################################################################