############ setup ############################# setwd("D:\\sls\\training\\slsjune22\\webpage") dir() rm (list = ls()) load("acs2018.Rdata") dim(acs2018) names(acs2018) library(synthpop) codebook.syn(acs2018)$tab summary(acs2018) # # suggests putting puma at the end or dropping it for comparisons # #system.time(syn_cart <- syn(acs2018,seed =3456, # visit.sequence = c(2:22,1))) ## stops with warning #system.time(syn_cart <- syn(acs2018,seed =3456, maxfaclevels = 200, # visit.sequence = c(2:22,1))) ##################### < 6 mins ## but to speed things up and because cart took longer I dropped puma # acs2018 <- acs2018[,-1] ## drop puma ################################### cart models ################################ # system.time(syn_cart <- syn(acs2018,seed =3456)) ################## < 5 mins ## compare(syn_cart, acs2018, plot = FALSE) ## worst is citizen compare(syn_cart, acs2018, vars = "citizen", table = TRUE) ## does not look too bad but not a citizen %s don't agree well utility.tables(syn_cart, acs2018, ntabstoprint =1) ## relations with marst dont look too good # ############################## ctree models ################################### system.time(syn_ctree <- syn(acs2018,seed =3456, method = "ctree")) ## just over 1 minute compare(syn_ctree, acs2018, plot = FALSE) compare(syn_ctree, acs2018, vars = names(acs2018)[18:21], table = TRUE) ## just look at last 4 utility.tables(syn_ctree, acs2018) # ####################### stratification and reordering for cart model################################################################# ## examine problems # # bad univariate citizen # compare(syn_cart, acs2018, vars = "citizen", table = TRUE) # ## bad bivariate age and marst # multi.compare(syn_cart, acs2018, var = "age", by= "marst", binwidth = 2) # # too many youbg widows amd old sungle people # try this # stratified by marst and moving order of citizen also reduced cart.cp to get bigger models # system.time(syn_cart2 <- syn.strata(acs2018, seed =3456, method = "cart", strata = "marst", visit.sequence = c(1,2,6,7,3:5, 8:21), cart.cp = 1e-12 )) ## under 5 mins compare(syn_cart2, acs2018, plot = FALSE)## much better compare(syn_cart2, acs2018, vars= "citizen", table = TRUE) ## fine now utility.tables(syn_cart2, acs2018, ntabstoprint = 2) ## still a few pairs not great multi.compare(syn_cart2, acs2018, var = "age", by= "marst") ## this one fixed by stratifying multi.compare(syn_cart2, acs2018, var = "citizen", by= "hinscare") ## this one still not great ## too many naturalised have this kind of insurance in synthetic data ## #try putting citizen englang hinscare sex and age at the start withcatall # meth <- syn_cart$method meth[c(1,2,6,7,12)] <- "catall" system.time(syn_cart3 <- syn.strata(acs2018, seed =3456, method = meth,numtocat = "age", strata = "marst", visit.sequence = c(1,2,6,7,12, 3:5, 8:11,13:21), cart.cp = 1e-12 )) ## under 5 mins compare(syn_cart3, acs2018, plot = FALSE)## much better utility.tables(syn_cart3, acs2018, ntabstoprint = 2) ## still one bad one multi.compare(syn_cart3, acs2018, var = "age", by= "hinscare") ## a steep transition at age 65 and over for this type of insurance ## check the original data for this with(acs2018, table(age,hinscare)) # # add the over65 grouping variable to the stratification to get the transition modelled correctly # acs2018$ov65 <- factor(acs2018$age >64) meth <- c(meth,"constant") ## ret its method to 'constant' as it will be in each stratum system.time(syn_cart4 <- syn.strata(acs2018, seed =3456, method = meth,numtocat = "age", strata = c("ov65","marst"), visit.sequence = c(1,2,6,7,12, 3:5, 8:11,13:21), cart.cp = 1e-12 )) ## under 5 mins compare(syn_cart4, acs2018, plot = FALSE)## all good utility.tables(syn_cart4, acs2018, ntabstoprint = 1) ## still one bad one different multi.compare(syn_cart4, acs2018, var = "age", by= "hinscare") ## this one now ok # # add the other insurance variables to catall at the start # meth[c(1,2,6:12)]<- "catall" system.time(syn_cart5 <- syn.strata(acs2018, seed =3456, method = meth,numtocat = "age", strata = c("ov65","marst"), visit.sequence = c(1,2,6:12, 3:5, 13:21), cart.cp = 1e-12 )) ## under 5 mins compare(syn_cart5, acs2018, plot = FALSE)## all good utility.tables(syn_cart5, acs2018, ntabstoprint = 1, max.scale = 30) ## pretty good this will have to do!! multi.compare(syn_cart5, acs2018, var = "age", by= "hinscare") ## a steep transition at age 65 and overok now # #############syn_cart5 to release ################################################################################ replicated.uniques(syn_cart5, acs2018) # # 9.7% replicated uniques - bit too may to drop # syn_cart5_sdc <- sdc(syn_cart5, acs2018, label =" made up data", recode.vars = c("inctot", "age"), bottom.top.coding = list(c(-4000,600000), c(NA,90)), smooth.vars = "inctot" ) ## now syn_cart5_sdc$syn is ready to be released head(syn_cart5_sdc$syn)