/*
/ Program : dlm2sas.sas / Version : 2.0 / Author : Roland Rashleigh-Berry / Date : 07-Jul-2014 / Purpose : To read in a delimited flat file and convert it to a sas dataset / with all variables character by default. / SubMacros : %varlist / Notes : This macro is designed to force fields to be character when a flat / file is converted to a SAS dataset to avoid problems where values / can start as numeric and then turn into alphanumeric too late to / correct the variable type. All the fields will be character and / you must either run a following data step to fix lengths and types / or you can define a dataset to the dsfixvars= parameter to fix / variables in the output dataset. / / This will only work on SIMPLE files where the delimiter is never / included in a valid text field so you should not use this on comma / delimited files as a comma might be part of a valid text string. / / All the character fields are the number of bytes long that is / specified to colw=. If you want to shorten these fields to more / suitable lengths then this must be done in a subsequent data step / or you can define a dataset to the dsfixvars= parameter. / / The %optlength macro might be useful for finding out the optimum / length of character fields. / / If you use getnames=yes (default) then any text found in the first / line is converted if needed to form valid variable names. / / Usage : data fix; / length name informat $ 32 type $ 4; / name='studyid';length=30;type='char';output; / name='ptm';length=8;type='num';informat='??time6.';output; / run; / / %dlm2sas(C:\Users\rashleig\Downloads\xxxxx.txt,test, / dsfixvars=fix); /=============================================================================== / PARAMETERS: /-------name------- -------------------------description------------------------ / infile (pos) (unquoted) Full path name of input file. Enclose in / %nrstr() if the path name contains spaces, "&" or "%". / dsout (pos) Output dataset name (defaults to _dlm2sas) (modifiers / are allowed) / delimiter="09"x Delimiter character (defaults to horizontal tab) / colw=256 Number of bytes to allocate to each column / getnames=yes By default, use the first row as the source of the column / names otherwise columns are named C1, C2 etc.. / termstr=CRLF Terminating characters for each line. CRLF is for Windows / platforms where the file is a pure ascii file. LF is for / Unix platforms or utf-8 encoded files. You can use the / %termstr macro to help you decide this value. / dsfixvars= Dataset containing the fields "name" (char), "type" (char4), / "length" (num) and "informat" (char) for correcting the / output variable characteristics in the output dataset if it / finds a match on variable name. Note that the "informat" / field in this dataset will be APPLIED to transform the / variable and not be used to give the output dataset variable / that informat as an attribute. /=============================================================================== / AMENDMENT HISTORY: / init --date-- mod-id ----------------------description------------------------ / rrb 02Jul11 Add checking of input parameters (v1.1) / rrb 21Mar13 rrb001 Algorithm changed for deriving variable name from string / containing invalid characters for variable naming so that / multiple adjacent invalid characters are replaced by a / single underscore instead of multiple underscores as / used to be the case (v1.2) / rrb 29Apr13 %nrbquote(), %superq() and %nrstr() used to mask file / name and file path must be unquoted (v1.3) / rrb 02May13 termstr=CRLF parameter added (v1.4) / rrb 07Jul14 dsfixvars= processing added, macro purpose updated and / header info improved (v2.0) /=============================================================================== / This is public domain software. No guarantee as to suitability or accuracy is / given or implied. User uses this code entirely at their own risk. /=============================================================================*/ %put MACRO CALLED: dlm2sas v2.0; %macro dlm2sas(infile ,dsout ,delimiter="09"x ,colw=256 ,getnames=yes ,termstr=CRLF ,dsfixvars= ); %local maxcol colnames err varlist savopts; %let err=ERR%str(OR); %if not %length(&infile) %then %do; %put &err: (dlm2sas) No file path specified; %goto exit; %end; %else %do; %if not %sysfunc(fileexist(%nrbquote(&infile))) %then %do; %put &err: (dlm2sas) File "&infile" can not be found; %goto exit; %end; %end; %if not %length(&dsout) %then %let dsout=_dlm2sas; %if not %length(&colw) %then %let colw=256; %if not %length(&delimiter) %then %let delimiter="09"x; %if not %length(&getnames) %then %let getnames=yes; %let getnames=%upcase(%substr(&getnames,1,1)); data _null_; length name $ 32 colnames $ 1024; infile "%nrstr(%superq(infile))" pad lrecl=32767 termstr=&termstr; input; maxcol=countc(_infile_,&delimiter)+1; call symput('maxcol',put(maxcol,best.)); %if &getnames EQ Y %then %do; i=1; name=trim(scan(_infile_,i,&delimiter)); do while(name ne ""); *- make sure name syntax is valid -; *----- rrb001: the line directly below is the old method no longer used -----; ****name=upcase(translate(trim(name),"____________________"," '&%+()@/\#?=$!.,:-")); name=upcase(prxchange('s§[^a-zA-Z0-9]+§_§',-1,trim(name))); colnames=trim(colnames)||" "||name; i=i+1; name=trim(scan(_infile_,i,&delimiter)); end; %end; %else %do; name=""; do i=1 to maxcol; colnames=trim(colnames)||" C"||compress(put(i,3.)); end; %end; call symput('colnames',trim(colnames)); stop; run; data %if not %length(&dsfixvars) %then %do; &dsout %end; %else %do; _dlm2 %end; ; informat &colnames $&colw.. ; format &colnames $&colw.. ; infile "%nrstr(%superq(infile))" delimiter=&delimiter MISSOVER DSD lrecl=32767 termstr=&termstr %if &getnames EQ Y %then %do; firstobs=2 %end; ; input &colnames; run; %if %length(&dsfixvars) %then %do; *- store the option setting for the variable length warning -; %let savopts=%sysfunc(getoption(varlenchk,keyword)); options varlenchk=nowarn; *- create a list of variables for a final KEEP statement -; %let varlist=%varlist(_dlm2); *- get the contents of the temporary dataset -; proc contents data=_dlm2 noprint out=_dlm2cont(keep=name type length); run; *- standardise the variable characteristics -; data _dlm2cont; length type $ 1; set _dlm2cont(rename=(type=old_type)); name=upcase(name); if old_type=1 then type='N'; else type='C'; DROP old_type; run; proc sort data=_dlm2cont; by name type; run; *- standardise the variable characteristics -; data _dlm2fix; length type $ 1; set &dsfixvars; type=upcase(type); name=upcase(name); KEEP name type length informat; run; proc sort data=_dlm2fix; by name type; run; *- merge the new variable information on top -; data _dlm2cont; merge _dlm2cont(in=_cont) _dlm2fix(rename=(type=new_type length=new_length)); by name; if _cont; if (new_type NE type) or (new_length NE length) ; run; *- set up fileref for generated code -; filename _dlmcode TEMP; *- generate the code to reset lengths, rename variables and convert them -; data _null_; length str $ 120; file _dlmcode; *- generate the LENGTH statement -; put 'LENGTH '; do ptr=1 to nobs; set _dlm2cont point=ptr nobs=nobs; str=trim(name); if new_type EQ 'N' then str=str; else if new_type EQ 'C' then str=trim(str)||" $"; else if type EQ 'C' then str=trim(str)||" $"; if new_length NE . then str=trim(str)||" "||strip(put(new_length,4.)); else if length NE . then str=trim(str)||" "||strip(put(length,4.)); put str; end; put ';'; *- Generate the SET statement with the renames -; *- (do not worry if there are no renames because -; *- SAS wont complain about empty declarations). -; put 'SET _dlm2(rename=('; do ptr=1 to nobs; set _dlm2cont point=ptr; if new_type='N' or informat NE ' ' then do; str=trim(name)||'=char_'||name; put str; end; end; put '));'; *- generate the transformation of character to numeric -; do ptr=1 to nobs; set _dlm2cont point=ptr; if new_type='N' or informat NE ' ' then do; if new_type='N' and informat=' ' then informat='comma32.'; str=trim(name)||'=input(char_'||trim(name)||','||trim(informat)||');'; put str; end; end; stop; run; *- create the final output dataset keeping only the original variables -; DATA &dsout; %include _dlmcode / source; KEEP &varlist; run; *- clear the generated code fileref -; filename _dlmcode CLEAR; *- tidy up -; proc datasets nolist memtype=data; delete _dlm2 _dlm2cont _dlm2fix; quit; *- reset variable length warning option -; options &savopts; %end; %goto skip; %exit: %put &err: (dlm2sas) Leaving macro due to problem(s) listed; %skip: %mend dlm2sas;