*Call in the CHS 2011. Working data set is called 'chs2011' 			*
*This program provides sample code to use when analyzing survey data	*
*There are 8792 observations and 174 variables in the dataset			*

The stratification (nesting) variable is strata
The survey weight variable is wt12_dual

*******************************************************************************************************************************
** In 2011, the CHS weighting methodology was updated to use Census 2010 as the source of population control totals and
** the American Community Survey as the source of additional demographic characteristics.
** For details, see the report "Methodology updates to the NYC CHS" at:
** http://www.nyc.gov/html/doh/html/data/epiresearch.shtml
******************************************************************************************************************************* 

For more information, please contact:
NYC Department of Health & Mental Hygiene
Bureau of Epidemiology Services
EpiDataRequest@health.nyc.gov
*********************************************************************;


/* Enter the pathway where the dataset and format programs are stored:
   replace each 'x' below with that folder. */

libname intdat    'x';
filename formatin 'x\formatstatements_chs2011_public.sas';
%include          'x\formats_chs2011_public.sas';

/* Create the working dataset chs2011 from the public-use file and pull in the
   statements stored under the formatin fileref, which was previously declared
   but never used.
   NOTE(review): assumes formatstatements_chs2011_public.sas contains only
   FORMAT statements that are valid inside a data step, and that the formats
   they name are created by formats_chs2011_public.sas - confirm before running. */
data chs2011;
	set intdat.chs2011_public;
	%include formatin;
run;

/* List the variables and attributes of the working dataset */
proc contents data=chs2011;
run;


/********Instructions for analyzing CHS 2011 data*****************
Survey data must be analyzed with a procedure that accounts for the complex
survey design: in SAS, use proc surveymeans; alternatively, use SUDAAN or another
software package (such as Stata) that can handle complex survey designs.
The Bureau of Epidemiology Services recommends SUDAAN.
************************************************************************************/

/* Sample code: regular SAS procedures give point estimates that are fine,
   but their standard errors will NOT be correct.
   Remember to use the weight statement. */
proc freq data=chs2011;
	weight wt12_dual;
	/* sex crossed with each outcome variable */
	tables sex*smoker sex*generalhealth;
run;

/* Sample code for proc surveymeans - standard errors are correct.
   Point estimates are the same as in the proc freq step above;
   however, these estimates are not age-adjusted. */
proc surveymeans data=chs2011 nobs mean clm sum std clsum;
	weight wt12_dual;             /* weight statement */
	strata strata;                /* survey design information */
	class  smoker generalhealth;  /* all variables in the var statement that are categorical */
	var    smoker generalhealth;  /* variables you are interested in analyzing */
	domain sex;                   /* variable to see estimates stratified by */
run;


/* Sample code for SUDAAN, proc descript */

/* The data MUST be sorted by the stratification variable first */
proc sort data=chs2011;
	by strata;
run;

/* Now run proc descript */
proc descript data=chs2011 filetype=sas design=strwr nomarg;
	nest   strata;     /* survey strata variable */
	weight wt12_dual;  /* survey weight variable */

	/* Variables you are interested in analyzing; each entry on var is lined up
	   with the level of that variable requested on catlevel */
	var      smoker smoker smoker generalhealth generalhealth generalhealth generalhealth generalhealth;
	catlevel 1      2      3      1             2             3             4             5;

	/* _one_ gives the overall total for each variable;
	   sex produces the gender-specific estimates */
	tables _one_ sex;

	/* All variables on the tables statement must also be on the class statement;
	   agegroup is needed for age-adjustment */
	class  _one_ sex agegroup/nofreq;

	/* Age-adjustment of estimates: use the US 2000 Standard Population.
	   These weights are for agegroup total: different age adjustment weights
	   are needed for variables that use other agegroups */
	stdvar agegroup;
	stdwgt 0.128810 0.401725 0.299194 0.170271;

	/* Produce output with results rounded to 1 decimal place */
	setenv decwidth=1;

	/* Print the results */
	print/style=nchs;

	/* Produce an output dataset of results */
	output/filename=output11 filetype=sas tablecell=default replace;

	title1 'Prevalence of Smoking Status and General Health Status, by Gender: CHS2011';
run;

/* Compute the relative standard error (RSE) of the estimates.
   Estimates with RSE >= 0.30 or sample sizes < 50 are considered unstable:
   http://www1.nyc.gov/assets/doh/downloads/pdf/episrv/bes_data_reliability.pdf

   Reads output11 (the dataset written by the proc descript step) and adds:
     rse    = sepercent / percent
     ciband = uppct - lowpct
     halfw  = ciband / 2
     flag   = reliability marker ('*', '^' or '**'); left blank when no rule fires
   rse, ciband and halfw are only computed when percent is not exactly 0 or 100.
   NOTE(review): the meaning of each flag symbol is not defined in this program;
   presumably it follows the reliability guidance linked above - confirm. */
data rsecheck;
	set output11;
	length flag $2;  /* wide enough for the two-character marker */

	if percent in (0, 100) then do;
		/* Estimate of exactly 0 or 100: the marker depends on sample size only */
		if nsum >= 50 then flag = '**';
		else flag = '^';
	end;
	else do;
		rse    = sepercent / percent;
		ciband = uppct - lowpct;
		halfw  = ciband / 2;

		if sepercent = 0 and ciband = 0 then flag = '^';
		else if rse >= 0.5 then do;
			/* Very large RSE: the marker depends on the confidence interval width */
			if ciband >= 6 then flag = '^';
			else flag = '*';
		end;
		else if rse >= 0.3 then flag = '*';
		else do;
			/* rse < 0.3 (SAS orders a missing rse below any number, so it lands here too):
			   flag a small sample, or a confidence interval half-width above 10 */
			if nsum < 50 or halfw > 10 then flag = '*';
		end;
	end;
run;

/* Widen the listing, then print only the estimates that received a flag */
options linesize=150;
proc print data=rsecheck noobs;
	where flag in ('*', '^', '**');
	var flag rse nsum ciband halfw percent lowpct uppct;
run;


  /*For more details on age-adjustment, see:
  Klein RJ, Schoenborn CA. Age adjustment using the 2000 projected U.S. population. Healthy
  People Statistical Notes, no. 20. Hyattsville,Maryland: National Center for Health Statistics.
  January 2001. http://www.cdc.gov/nchs/data/statnt/statnt20.pdf */