Online File
Rick Aster: Professional SAS Programming Shortcuts: Contents
*
  Word view: reads blank-delimited tokens from fileref TEXT and writes
  one observation for each token that still has characters after
  punctuation is removed.
  Kept variables:
    wordtext  token as read from the file
    word      token in uppercase with most punctuation removed
    length    length of word, not counting apostrophes and hyphens
    sentence  running sentence number, starting at 1
    sn        position of the word within its sentence
*;
data work.word (keep=word wordtext length sentence sn)
    / view=work.word;
  length wordtext word $ 24 endchar letter $ 1;
  * sentenceend: 1 when the previous token looked like a sentence end. ;
  retain sentenceend 0 sentence 1;
  infile text flowover;
  * Trailing @@ holds the line so each execution reads the next token. ;
  input wordtext @@;
  *
    Word: convert word text to uppercase
    and remove most punctuation.
  *;
  word = upcase(compress(wordtext,
      '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'));
  length = length(compress(word, "'-"));
  if word ne '' then do;
    * Check for sentence break. ;
    if sentenceend then do;
      *
        If first letter is uppercase, start new sentence.
        The test uses the original text in wordtext because word
        has already been converted to uppercase.
      *;
      li = anyalpha(wordtext);
      if li then letter = substr(wordtext, li, 1);
      else letter = ' ';
      if anyupper(letter) then do;
        sentence + 1;
        sn = 0;
      end;
      * Otherwise the previous token did not end the sentence. ;
      else sentenceend = 0;
    end;
    * n counts all words and is not kept in the output. ;
    n + 1;
    sn + 1;
    output;
  end;
  *
    Possible end of sentence: word text ends in period,
    exclamation point, or question mark, possibly followed
    by quotation marks.
  *;
  if sn > 0 then do;
    * Blank out quotation marks so length finds the last other character. ;
    endchari = length(translate(wordtext, ' ', '"'''));
    endchar = substr(wordtext, endchari, 1);
    sentenceend = endchar in ('.', '!', '?');
  end;
run;
*
  Sentence: one observation per sentence, taken from the last word
  of each sentence. The word position sn of that last word is stored
  as slength, the number of words in the sentence.
*;
data work.sentence (keep=sentence sn rename=(sn=slength));
  set work.word;
  by sentence;
  if last.sentence then output;
run;
*
  Word summary: wcount is the number of observations in work.word
  and length is the mean of the word lengths.
*;
proc summary data=work.word;
  var length;
  output out=work.wsum (rename=(_freq_=wcount)) mean(length)=length;
run;
*
  Sentence summary: scount is the number of observations in
  work.sentence and slength is the mean number of words per sentence.
*;
proc summary data=work.sentence;
  var slength;
  output out=work.ssum (rename=(_freq_=scount)) mean(slength)=slength;
run;
*
  Report: combine the one-row word and sentence summaries and print
  the four statistics, one per line, after a blank line.
*;
title1 'Thoreau Paragraph';
data _null_;
  set work.wsum;
  set work.ssum;
  file print;
  put;
  put 'Word count: ' wcount : comma9.;
  put 'Average word length: ' length : 5.2;
  put 'Sentence count: ' scount : comma7.;
  put 'Words per sentence: ' slength : 6.2;
run;
*
  Word frequencies: with no VAR statement the output holds only counts.
  The overall row (_type_ = 0) gives the total and the rows for each
  distinct word are ordered by descending frequency. The count is
  stored as n.
*;
proc summary data=work.word;
  class word / order=freq;
  output out=work.wordlist1 (rename=(_freq_=n));
run;
*
  Word list: read the overall count once as wordcount, then compute
  for each word the percent of all words that it accounts for.
*;
data wordlist;
  * The overall row is read only on the first iteration and its value is retained. ;
  if _n_ = 1 then
    set work.wordlist1 (where=(_type_ = 0) rename=(n=wordcount));
  set work.wordlist1 (where=(_type_ > 0));
  percent = n/wordcount*100;
  drop _type_ wordcount;
run;
* Print the words that make up at least 1 percent of all words. ;
proc print data=wordlist heading=horizontal noobs;
  where percent >= 1;
  var word n percent;
  format n comma6. percent f7.3;
run;