%macro table(outdata,indatay,ywt,yvar,
rowvar,colvar,pagevar,
indatax,xwt,xvar,RCPx,rowfmt,colfmt,pagefmt,nhouse);

/*
SAS macro for computing summary statistics and standard
errors from NPTS.

NOTE:  THIS DRAFT MACRO IS NOT READY FOR PUBLIC RELEASE.

NOTE:  This program is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY, without even an
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.

AUTHOR: Rick Schmoyer (ric@ornl.gov)

ACKNOWLEDGEMENT:  Thanks to Jenny Young for finding bugs.

ARGUMENTS:
outdata=output data set.  Necessary.

indatay=input data set containing y variable (yvar).  Necessary.
MUST BE SORTED BY VARSTRAT SUBSTRAT HOUSEID;
Unchanged on output.

ywt=weight for numerator variable in ratio (if xwt defined).  Necessary.
yvar=analysis variable for numerator in ratio.  Optional.

rowvar=first class variable (defines rows).  Optional,
but necessary if colvar is defined.

colvar=second class variable (defines columns). Optional,
but necessary if pagevar is defined.

pagevar=third class variable (defines pages).  Optional.

indatax=input data set containing x variable (xvar).  Optional,
but MUST BE SORTED BY VARSTRAT SUBSTRAT HOUSEID.  Can be the same
as indatay.  Unchanged on output.

xwt=weight for denominator in ratio.  Optional.

xvar=analysis variable for denominator in ratio.  Optional.

RCPx=subset of rowvar colvar pagevar to define denominator classes.
RCPx must be formable by deletion of zero or more entries
in the rowvar colvar pagevar list.  Optional.

rowfmt=name of user-supplied format for row variable (to override default)

colfmt=name of user-supplied format for column variable (to override default)

pagefmt=name of user-supplied format for page variable (to override default)

nhouse=SAS data set with variables varstrat (stratum)
substrat (substratum) and nhouse (number of households).
nhouse must be computed from a proper count of households
not excluded by a subsetting where or because of missing
data (e.g., trips for that household, but trip variables of
interest all missing).  Household counts should not exlude
variables simply because they are not represented in indatay
(e.g., no trips for a household does not automatically imply
the household should not be counted).


NOTE:  Class variable code '.' is not admitted (i.e., is
dropped in proc means).  Use something like 999 instead.
(Further, proc means does generate '.' values for the All
categories.)

NOTE:  Formats entered implicitly along with data must be in
library 'LIBRARY'.  Formats entered explicitly as macro arguments
must be in default work library.

NOTE:  Must have mautosource options and SASAUTOS= defined right
to call SAS system macros (see sasautos).


OUTPUT VARIABLES in outdata ('&' denotes macro argument):
NOTE:  THESE VARIABLE NAMES ARE RESERVED WORDS.

&rowvar   Same as input
&colvar   Same as input
&pagevar  Same as input

&yvar     If &yvar is not blank then weighted sum of input &yvar.
          If &yvar is blank, then macro variale yvar is reassigned value
	  &ywt, and &yvar becomes sum of &yvar (&ywt).

&ywt      Sum of input &ywt

xvar      If &xvar is blank then macro variable xvar is reassigned value
	  &xwt.  Then &xvar is renamed to xvar.  This avoids problem
	  when &xvar=&yvar or &xwt=&ywt.

ny        Number of observations with a &yvar, or, if no &yvar, then &ywt
nx        Number of observations with an xvar

std_erry  Standard error of &yvar total
se_ywt    Standard error of &ywt total
std_errx  Standard error of xvar total

ratiox    Ratio of &yvar to xvar totals
ratioy    Ratio of &yvar to &ywt totals

se_ratx   Standard error of ratiox--based on Taylor approximation
se_raty   Standard error of ratioy--based on Taylor approximation

se_ratx1  | Standard errors of ratios when ratio denominator
se_raty1  | is FIXED, as when denominator is a control total

corrx     Weighted correlation of xvar and &yvar 
corry     Weighted correlation of &yvar and &ywt 

lcb_fx    | Lower and Upper Fieller-type 95% two-sided confidence
ucb_fx    | bounds for ratio estimates (see Fieller E. (1932), "The
lcb_fy    | distribution of the index in normal bivariate populations," 
ucb_fy    | BIOMETRIKA, 24, 428-440) based on se_ratx and se_raty.

lcb_tx    | 
ucb_tx    | Lower and Upper 95% two-sided confidence bounds for
lcb_ty    | ratio estimates, based on Taylor approximation for
ucb_ty    | variance of ratio.   Based on se_ratx and se_raty. 


NOTE: Ratio LCBs are allowed to be negative to accommodate
cases when numerators are negative.  If they turn out
negative when they are logically nonnegative, set them to
zero.  But if ratio estimate DENOMINATORS turn out
nonpositive, ratio estimates and variances are not computed.
Further, Fieller confidence bounds are not computed if (1)
denominators are not SIGNIFICANTLY different from zero,
which occurs when when A < 0 in quadratic equation Az**2 +
Bz + C = 0 for confidence bounds (i.e., roots are confidence
bounds--see below), or (2) when modulus B**2 - 4AC is
negative (no real roots).

NOTE: Confidence bounds are based on normal approximation.
*/

%local dsn dsnx crossx crossy covx covy std_errx
dsid rfmt rowtype cfmt coltype pfmt pagetype rc
rowlib collib pagelib ycode rowf colf pagf yvarlab
ywtlab xlabel;

%if &rowfmt ne %then %let rowlib=work;
%else %let rowlib=library;

%if &colfmt ne %then %let collib=work;
%else %let collib=library;

%if &pagefmt ne %then %let pagelib=work;
%else %let pagelib=library;

*STEP 1:  Weight analysis variables and assign formats;
data;
set &indatay (keep=varstrat substrat houseid
&yvar &ywt &rowvar &colvar &pagevar);

%if &yvar= %then %do;
   %let yvar=&ywt;
   %let ycode=NR;  *No ratio;
   if _n_=1 then do;
      length yvarlab $ 40;
      call label(&ywt,yvarlab);
      call symput('yvarlab',trim(left(yvarlab)));
      drop yvarlab;
   end;
%end;
%else %do;
   &yvar=&yvar*&ywt;
   %let ycode=R;  *Ratio;
   %let crossy=crossy;
   %let covy=covy;
   if _n_=1 then do;
      length yvarlab ywtlab $ 40;
      call label(&ywt,ywtlab);
      call label(&yvar,yvarlab);
      call symput('ywtlab',trim(left(ywtlab)));
      call symput('yvarlab',trim(left(yvarlab)));
      drop ywtlab yvarlab;
   end;
%end;

%if &rowfmt ne %then %str(format &rowvar &rowfmt..;);
%if &colfmt ne %then %str(format &colvar &colfmt..;);
%if &pagefmt ne %then %str(format &pagevar &pagefmt..;);
run;
%let dsn=%trim(%left(&syslast));

%if &xwt ne %then %do;
   data;
   set &indatax (keep=varstrat substrat houseid &xvar &xwt &RCPx);
   %if &xvar= %then %let xvar=&xwt;
   %else %str(&xvar=&xvar*&xwt;);
   rename &xvar=xvar;
   run;
   %let dsnx=%trim(%left(&syslast));

   *Associate user assigned formats for X data;
   %let dsid=%sysfunc(open(&dsnx,i));
   %if (&rowvar ne ) and (&rowfmt ne ) %then %if
      %sysfunc(varnum(&dsid,&rowvar)) %then %let rowf=1;
   %if (&colvar ne ) and (&colfmt ne ) %then %if
      %sysfunc(varnum(&dsid,&colvar)) %then %let colf=1;
   %if (&pagevar ne ) and (&pagefmt ne ) %then %if
      %sysfunc(varnum(&dsid,&pagevar)) %then %let pagf=1;
   %let xlabel=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,xvar))));
   %if %quote(&xlabel) eq %then %let xlabel=&xvar;
   %let rc=%sysfunc(close(&dsid));

   proc datasets;
   modify %scan(&dsnx,2,.);
   %if &rowf ne %then %str(format &rowvar &rowfmt..;);
   %if &colf ne %then %str(format &colvar &colfmt..;);
   %if &pagf ne %then %str(format &pagevar &pagefmt..;);
%end;

%else %do;
   data;
   run;
   %let dsnx=%trim(%left(&syslast));
%end;

*STEP 2:  For each class C, sum up weights to household
(ultimate cluster) level.  Note that C may be an
intersection or union.  Classes are defined by rowvar colvar pagevar;

proc means data=&dsn noprint;
%if &rowvar ne %then %str(class &rowvar &colvar &pagevar;);
by varstrat substrat houseid;
%if &ycode=NR %then %str(var &yvar;);
%else %str(var &yvar &ywt;);
output out=&dsn (drop=_FREQ_ _TYPE_)
n=ny
sum=;
run;

%if &xwt ne %then %do;

   %let crossx=crossx;
   %let covx=covx;

   proc means noprint data=&dsnx;
   %if &RCPx ne %then %str(class &RCPx;);
   by varstrat substrat houseid; 
   var xvar;
   output out=&dsnx (drop=_TYPE_ _FREQ_)
   n=nx
   sum=;
   run;

   *Idea now is to merge each possibly broader x-total with
   its corresponding y-total;

   proc sort data=&dsnx;
   by varstrat substrat &RCPx houseid;

   proc sort data=&dsn;
   by varstrat substrat &RCPx houseid;

   data &dsn;
   merge &dsn (in=in1) &dsnx (keep=varstrat substrat houseid &RCPx xvar);
   by varstrat substrat &RCPx houseid;
   if in1;
   crossx=xvar*&yvar;
   drop xvar;

   *STEP 3:  Summary stats--sums over households--by substratum
   within stratum for each class C.  Class statement is not
   used because _TYPE_ has already been defined.;

   proc means noprint data=&dsnx;
   by varstrat substrat &RCPx;
   var xvar nx;
   output out=&dsnx (drop=_freq_ _type_)
   sum=
   uss(xvar)=ussx;

   *Merge in ultimate cluster frequency in each substratum;
   data &dsnx;
   merge &dsnx (in=in1) &nhouse;
   by varstrat substrat;

   if in1;
      if nhouse gt 1 then
      std_errx=(nhouse/(nhouse-1))*(ussx - xvar*xvar/nhouse);

      *If substratum has only one observation then estimate
      variance as square of total (expectation is variance + mean
      squared).  This is OK as long as there are not many
      singletons, but may be extremely conservative if there are.;

      else std_errx=xvar*xvar;
%end;

%if &ycode=R %then %do;
   data &dsn;
   set &dsn;
   crossy=&yvar*&ywt;
%end;

proc sort data=&dsn;
by varstrat substrat &rowvar &colvar &pagevar;

proc means noprint data=&dsn;
by varstrat substrat &rowvar &colvar &pagevar;
%if &ycode=R %then %str(var &yvar &ywt &crossy &crossx ny;);
%else %str(var &yvar &crossx ny;);
output out=&dsn (drop=_freq_ _type_)
sum=
%if &ycode=R %then %str(uss (&yvar &ywt)=uss ussw;);
%else %str(uss (&yvar)=uss;);

*Merge in x-data plus ultimate cluster frequency in each
substratum (nhouse);

%if &xwt ne %then %do;
   proc sort data=&dsn;
   by varstrat substrat &RCPx;

   data &dsn;
   merge &dsn (in=in1) &dsnx (keep=nhouse xvar varstrat substrat &RCPx);
   by varstrat substrat &RCPx;

   if in1;

      if nhouse gt 1 then do;
	 std_erry=(nhouse/(nhouse-1))*(uss - &yvar*&yvar/nhouse);
	 covx=(nhouse/(nhouse-1))*(&crossx - xvar*&yvar/nhouse);
	 %if &ycode=R %then %do;
	    covy=(nhouse/(nhouse-1))*(&crossy - &yvar*&ywt/nhouse);
	    se_ywt=(nhouse/(nhouse-1))*(ussw - &ywt*&ywt/nhouse);
	 %end;
      end;

   *If substratum has only one observation then estimate
   variance as square of total (expectation is variance + mean
   squared).  This is OK as long as there are not many
   singletons, but may be extremely conservative if there are.;

      else do;
	 covx=xvar*&yvar;
	 std_erry=&yvar*&yvar;
	 %if &ycode=R %then %do;
	    covy=&ywt*&yvar;
	    se_ywt=&ywt*&ywt;
	 %end;
      end;

   drop nhouse xvar;

%end;

%else %do;
   data &dsn;
   merge &dsn (in=in1) &nhouse;
   by varstrat substrat;

   if in1;

      if nhouse gt 1 then do;
	 std_erry=(nhouse/(nhouse-1))*(uss - &yvar*&yvar/nhouse);
	 %if &ycode=R %then %do;
	    covy=(nhouse/(nhouse-1))*(&crossy - &yvar*&ywt/nhouse);
	    se_ywt=(nhouse/(nhouse-1))*(ussw - &ywt*&ywt/nhouse);
	 %end;
      end;

      else do;
	 std_erry=&yvar*&yvar;
	 %if &ycode=R %then %do;
	    se_ywt=&ywt*&ywt;
	    covy=&ywt*&yvar;
	 %end;
      end;
   drop nhouse;
%end;

*STEP 4:  Tally up over substrata within classes;

proc means noprint data=&dsn nway missing;
class &rowvar &colvar &pagevar;
%if &ycode=R %then %str(var ny &yvar &ywt std_erry se_ywt &covx &covy;);
%else %str(var ny &yvar std_erry &covx;);
output out=&dsn (drop=_freq_ _type_)
sum=;

%if &xwt ne %then %do;
   %if &RCPx ne %then %do;
      proc means noprint data=&dsnx nway missing;
      class &RCPx;
      var nx xvar std_errx;
      output out=&dsnx (drop=_freq_ _type_)
      sum=;

      proc sort data=&dsn;
      by &RCPx;

      data &outdata;
      merge &dsn &dsnx;
      by &RCPx;
   %end;

   %else %do;
      proc means noprint data=&dsnx;
      var nx xvar std_errx;
      output out=&dsnx (drop=_freq_ _type_)
      sum=;

      data &outdata;
      retain xvar std_errx;
      set &dsn;
      if _n_ eq 1 then set &dsnx (keep=nx xvar std_errx);
   %end;

   if xvar gt 0 then do;
      ratiox=&yvar/xvar;
      se_ratx=(std_erry + std_errx*ratiox**2 - 2*covx*ratiox)/(xvar**2);
      se_ratx1=std_erry/xvar**2;

      *SE_RAT1 is std. err. of ratio when denominator is FIXED--as when
      its a control total.  How to identify this case?

      *Fieller-type 95% two-sided confidence bounds;
      t=probit(.975);
      A=xvar**2-t**2*std_errx;
      B=2*(t**2*covx-&yvar*xvar);
      C=&yvar**2-t**2*std_erry;

      MOD=B*B-4*A*C;

      *IF A lt 0, then xvar (denominator in ratio) is not
      significantly different from 0--ratio does not make much
      sense;
      if A gt 0 then do;
	 if MOD ge 0 then do;
	    MOD=sqrt(MOD);
	    ucb_fx=(-B+MOD)/(2*A);
	    lcb_fx=(-B-MOD)/(2*A);
	 end;
      end;
      else if A eq 0 then do;
	 if B gt 0 then ucb_fx=-C/B;
	 else if B lt 0 then lcb_fx=-C/B;
      end;
      if se_ratx ne . then se_ratx=sqrt(max(se_ratx,0));
      if se_ratx1 ne . then se_ratx1=sqrt(max(se_ratx1,0));
      *Taylor-type 95% two-sided confidence bounds;
      lcb_tx=ratiox-t*se_ratx;
      ucb_tx=ratiox+t*se_ratx;
      drop t A B C MOD;
   end;

   if std_errx gt 0 and std_erry gt 0 then
      corrx=covx/(sqrt(std_errx)*sqrt(std_erry));
   drop covx;

   std_errx=sqrt(std_errx);
   label
   nx="Num. with &xlabel"
   std_errx="SE &xlabel"
   ratiox="Ratio &yvarlab to &xlabel"
   se_ratx="Taylor SE Ratio"
   se_ratx1='Fixed-denom. Ratio SE'
   corrx="Corr &xlabel and &yvarlab" 
   lcb_fx='Fieller LCB X-Ratio'
   ucb_fx='Fieller UCB X-Ratio'
   lcb_tx='Taylor LCB X-Ratio'
   ucb_tx='Taylor UCB X-Ratio';
%end;
%else %do;
   data &outdata;
   set &dsn;
%end;

%if &ycode=R %then %do;

   if &ywt gt 0 then do;
      ratioy=&yvar/&ywt;
      se_raty=(std_erry + se_ywt*ratioy**2 - 2*covy*ratioy)/(&ywt**2);
      se_raty1=std_erry/&ywt**2;

      *SE_RATY1 is std. err. of ratio when denominator is FIXED--as when
      its a control total.  How to identify this case?

      *Fieller-type 95% two-sided confidence bounds;
      t=probit(.975);
      A=&ywt**2-t**2*se_ywt;
      B=2*(t**2*covy-&yvar*&ywt);
      C=&yvar**2-t**2*std_erry;

      MOD=B*B-4*A*C;

      *IF A lt 0, then ywt (denominator in ratio) is not
      significantly different from 0--ratio does not make much
      sense;
      if A gt 0 then do;
	 if MOD ge 0 then do;
	    MOD=sqrt(MOD);
	    ucb_fy=(-B+MOD)/(2*A);
	    lcb_fy=(-B-MOD)/(2*A);
	 end;
      end;
      else if A eq 0 then do;
	 if B gt 0 then ucb_fy=-C/B;
	 else if B lt 0 then lcb_fy=-C/B;
      end;
      if se_raty ne . then se_raty=sqrt(max(0,se_raty));
      if se_raty1 ne . then se_raty1=sqrt(max(0,se_raty1));
      *Taylor-type 95% two-sided confidence bounds;
      lcb_ty=ratioy-t*se_raty;
      ucb_ty=ratioy+t*se_raty;
      drop t A B C MOD;
   end;

   se_ywt=sqrt(se_ywt);
   if se_ywt gt 0 and std_erry gt 0 then
      corry=covy/(se_ywt*sqrt(std_erry));
   drop covy;

   label
   se_ywt="SE &ywtlab"
   ratioy="Ratio &yvarlab to &ywtlab"
   se_raty="Taylor SE Ratio"
   se_raty1='Fixed-denom. Ratio SE'
   corry="Corr &yvar and &ywtlab" 
   lcb_fy='Fieller LCB Ratio' 
   ucb_fy='Fieller UCB Ratio' 
   lcb_ty='Taylor LCB Ratio'
   ucb_ty='Taylor UCB Ratio';
%end;

std_erry=sqrt(std_erry);
label ny="Num. with &yvarlab"
      std_erry="SE &yvarlab";

*Return to rowvar colvar pagevar order;
%if &rowvar ne %then %do;
   proc sort data=&outdata;
   by &rowvar &colvar &pagevar;
%end;

*Cleanup;
proc datasets; delete %scan(&dsn,2,.) %scan(&dsnx,2,.);
run;
%mend table;