/*************************************************************** * * QuantileNormalize: Macro for micro array quantile normalization * * Definition: * QuantileNormalize(data,id,probe,var,normvar,expr=,statexpr=,out=,temp=t_,keeptemp=0) * * Parameters: * data = data set with expression data * id = array identifier * probe = probe identifier (must be unique within array and non-missing) * var = expression variable * normvar = variable to use for normalized expression (default=&var.) * expr = expression to use in normalization (default=&var.) * statexpr = expression used in pre-normalization statistics (default=&var.) * invexpr = inverse transformation (expressed in terms of &normvar.) at the end * out = output data set containing normalized values * temp = prefix for temporary data sets (default=t_) * keeptemp = set to 1 to keep temporary datasets (default=0) * * The macro produces the empirical cumulative distributions, estimates the average * quartiles (the normalized distribution), and fits the quartiles to this. * * By default, analyses are made on raw values, but it is possible pre-normalize the * data to adjust for some prior differences, e.g. different standard deviations. E.g. * specifying expr=(x-avg)/sd, where x is the expression level (or log-transformed * expression level, will ensure that arrays with large variation does not influence the * normalized distribution more than those with less variation. * * The statistics (avg=average, sd=standard deviation, n=#observations) are * computed for the &statexpr. expression. * * E.g. if x is the expression level (specified as var), you can log-transform by * setting expr=log(x). If you also want to pre-normalize these values, you can use * statexpr=log(x) and expr=(log(x)-avg)/sd: then avg and sd are computed for * log(x) rather than for x. * * The variable cum is added to the resulting data set containing the quantile. * * Example: * %QuantileNormalize(expr,arrayid,probeid,expr,x_norm,out=expr_norm); * ***************************************************************/ %MACRO QuantileNormalize(data,id,probe,var,normvar,out=,expr=,statexpr= ,invexpr=,temp=t_,keeptemp=0,where=1); %LOCAL n_ids n_probes; %IF %bquote(&out.)=%str() %THEN %LET out=&data.; %IF %bquote(&normvar.)=%str() %THEN %LET normvar=&var.; %IF %bquote(&expr.)=%str() %THEN %LET expr=&var.; %IF %bquote(&statexpr.)=%str() %THEN %LET statexpr=&var.; %IF %bquote(&invexpr.)=%str() %THEN %LET invexpr=&normvar.; proc sort data=&data. out=&temp.exp(keep=&id. &probe. &var.); where &where.; by &id. &var.; run; proc sql noprint; select count(distinct &id.) into :n_ids from temp; select count(distinct &probe.) into :n_probes from temp; quit; proc sql; create table &temp.stat as select &id.,count(*) as n,mean(&var.) as avg,std(&var.) as sd from &temp.exp where &var.>. group by &id. order by &id. ;quit; data &temp.ddist; merge &temp.exp(keep=&id. &probe. &var. where=(&var.>.)) &temp.stat; by &id.; array _probe_(&n_probes.) _temporary_; array _x_(0:%eval(&n_probes.+1)) _temporary_; retain i; if first.&id. then i=0; if &var.>. then do; i=i+1; _probe_(i)=&probe.; _x_(i)=&expr.; end; if last.&id. then do; if not n=i then put "WARNING: Should have " n= "equaling " i=; link distr; end; return; ; distr: wgt=1/&n_ids.; _x_(0)=.;_x_(n+1)=.; dx=0;dx_last=0; i=1; do while (i<=n); j=i+1; do while (_x_(j)=_x_(i)); j=j+1; end; if i=1 then link add_fst; if j-i=1 then link add_one; else link add_more; if j>n then link add_lst; i=j; end; return; ; output: d_cum=cum-cum_last; if x_last=. then ddx=0; else if &probe.=. then do; dx=(&var.-x_last)/d_cum; ddx=dx-dx_last; end; else ddx=0; output; cum_last=cum; x_last=&var.; dx_last=dx; return; ; add_one: &var.=_x_(i); cum=(i-.5)/n; &probe.=.; link output; &probe.=_probe_(i); link output; return; ; add_more: &var.=_x_(i); cum=(i-.5)/n; &probe.=.; link output; cum=(i+j-2)/(2*n); link output; do ii=i to j-1; &probe.=_probe_(ii); link output; end; &probe.=.; cum=(j-1.5)/n; link output; return; ; add_fst: &probe.=.; cum_last=0; x_last=.; &var.=_x_(i)-(_x_(j)-_x_(i))/2; cum=0; link output; return; ; add_lst: &probe.=.; &var.=&var.+(&var.-_x_(i-1))/2; cum=1; link output; return; ; keep &id. &probe. &var. x_last cum cum_last d_cum wgt ddx; run; proc sort data=&temp.ddist; by cum_last x_last &probe.; run; data &temp.normed(keep=&id. &probe. cum &normvar.) &temp.dist(keep=cum_last &normvar. rename=(cum_last=cum)); set &temp.ddist; by cum_last x_last; retain &normvar. 0 dx 0 prev_cum 0; if x_last=. then do; &normvar.=&normvar.+wgt*&var.; end; else if &probe.>. then output &temp.normed; else do; &normvar.=&normvar.+(cum_last-prev_cum)*dx; prev_cum=cum_last; dx=dx+wgt*ddx; end; if last.cum_last then output &temp.dist; run; proc sort data=&temp.normed; by &id. &probe.; run; data &out.; merge &data. &temp.normed(in=isNormed); by &id. &probe.; if isNormed then &normvar.=&invexpr.; else &normvar.=.; run; %IF not &keeptemp. %THEN %DO; proc datasets nolist; delete &temp.exp &temp.stat &temp.ddist &temp.dist &temp.normed; quit; %END; %MEND;