############ set up ################################################# # move to directory where you have saved the data file # setwd("D://sls/training/slsjune22/webpage") rm(list = ls()) ## clean out workspace library(synthpop) # # examine and check data - especially missing values # help(SD2011) codebook.syn(SD2011)$tab ## overall table codebook.syn(SD2011)$labs$placesize ## to see labels for this variable summary(SD2011) ##################################### select a set of 7 variables ####################### mydata <- SD2011[,c("sex","age", "alcabuse","smoke","nociga","height","weight","bmi")] codebook.syn(mydata)$tab summary(mydata) ########################################################################## ## note I have put plot = FALSE in compare so as not to have to wait for ## plots and press return. Remove this to see plots ## ########################################################################################### ############################### try default syn no worries ##################################################### syn.default <- syn(mydata, seed =7676) compare(syn.default, mydata, plot =FALSE) ## look OK utility.tables(syn.default, mydata, tables = "twoway") ## alcabuse and sex not good # ################ try stratifying by sex ######################### syn.default.strat <- syn.strata(mydata, strata = "sex", seed = 5634) compare(syn.default.strat, mydata, plot =FALSE) ## ;ook OK utility.tables(syn.default.strat, mydata, tables = "twoway", ntabstoprint = 1) ## alcabuse and sex better alcabuse and age worst but not too bad # #################### now look at some details of bmi ################################# # it should be calculated from height and weight # check it in the real and synthetic data # par(mfrow = 1:2) mydata$calcbmi <- mydata$weight/mydata$height^2*10000 head(mydata) with(mydata, plot(bmi, calcbmi, col ="dark red", pch = 20, cex=2)) mysyndata <- syn.default.strat$syn #mysyndata <- syn.v2$syn mysyndata$calcbmi <- mysyndata$weight/mysyndata$height^2 with(mysyndata, plot(bmi, calcbmi, col ="dark red", pch = 20, cex=2)) ### oops not so good # # something funny happening # par(mfrow = c(1,1)) mydata <- mydata[,-9] ## drop calculated one # # turns out on looking at the data that there are some values of bmi where # height and/or weight are missing these look like data errors of # some sort and bmi ought to have been set to missing for them # mydata$bmi[is.na(mydata$height) | is.na(mydata$weight)] # mydata$bmi[is.na(mydata$height) | is.na(mydata$weight)] <- NA # this gets rid of one huge one ############################## passive synthesis for bmi ############################# # mymethod <- syn.default$method mymethod mymethod[8] <- "~I(weight/height^2*10000)" mymethod syn.v2 <- syn.strata(mydata, strata = "sex", method = mymethod) summary(syn.v2$syn) # # calculate bmi passively # compare(syn.v2, mydata)#, plot =FALSE) ## look OK utility.tables(syn.v2, mydata, tables = "twoway", ntabstoprint = 1) ## alcabuse and sex better alcabuse and age worst but not too bad # # you might want to run the code for the computed variable above with new synthesis # see line commented out above # ###################### now check -8 values in nociga #################################################### # in case you want to define a cont.na for this variable # with(mydata, table(nociga, smoke)[1:5,]) ## some -8s are smokers with(syn.v2$syn, table(nociga, smoke)[1:5,]) ## similar in synthetic # # did not really need cont.na for this variable # ############################### parametric ############################### # syn.para <- syn(mydata, method = "parametric") parmeth <- syn.para$method ## to allow changing parmeth parmeth[8] <- "~I(weight/height^2*10000)" # # also -8 will be needed for nociga in a parametric synthesis # syn.para.cont.na <- syn(mydata, method = parmeth, cont.na = list(nociga = -8)) compare(syn.para.cont.na, mydata, plot =FALSE) ## better utility.tables(syn.para.cont.na, mydata, tables = "twoway", ntabstoprint = 1) # ## alcabuse and sex better alcabuse and age worst but not too bad # not too bad but cart was better so use that one for sdc # ############################ save data for cart models ################ # syn.v2.sdc <- sdc(syn.v2, mydata, label="false_data", rm.replicated.uniques = TRUE, recode.vars = c("age"), bottom.top.coding = c(NA,80), recode.exclude = c(NA,-8)) mysyndata <- syn.v2.sdc$syn ################### ready to save and send off table(mysyndata$age) head(mysyndata)