Online File
Rick Aster: Professional SAS Programming Shortcuts: Contents
DATA WORK.WORD (KEEP=WORD WORDTEXT LENGTH SENTENCE SN)
    / VIEW=WORK.WORD;
  *
    View that reads the text in fileref TEXT one blank-delimited token
    at a time and writes one observation per word.
      WORDTEXT  token as read, up to 24 characters
      WORD      token in uppercase with most punctuation removed
      LENGTH    length of WORD without apostrophes or hyphens
      SENTENCE  sentence number, starting at 1
      SN        position of the word within its sentence
    SENTENCEEND is a retained flag that is set when the previous token
    looked like the end of a sentence.
  *;
  LENGTH WORDTEXT WORD $ 24 ENDCHAR LETTER $ 1;
  RETAIN SENTENCEEND 0 SENTENCE 1;
  INFILE TEXT FLOWOVER;
  * Trailing double at-sign holds the line so each INPUT reads the next token. ;
  INPUT WORDTEXT @@;

  *
    Word: convert word text to uppercase
    and remove most punctuation.
  *;
  WORD = UPCASE(COMPRESS(WORDTEXT,
      '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'));
  LENGTH = LENGTH(COMPRESS(WORD, "'-"));

  * Tokens that are nothing but punctuation are not counted as words. ;
  IF WORD NE '' THEN DO;
    * Check for sentence break. ;
    IF SENTENCEEND THEN DO;
      *
        If first letter is uppercase, start new sentence.
        The test must look at WORDTEXT, which still has its original
        case. WORD is already uppercase, so testing it would treat every
        word as a sentence start and split sentences after abbreviations.
      *;
      LI = ANYALPHA(WORDTEXT);
      IF LI THEN LETTER = SUBSTR(WORDTEXT, LI, 1);
      ELSE LETTER = ' ';
      IF ANYUPPER(LETTER) THEN DO;
        SENTENCE + 1;
        SN = 0;
      END;
      ELSE SENTENCEEND = 0;
    END;
    * N and SN are sum-statement counters, so they start at 0 and are retained. ;
    N + 1;
    SN + 1;
    OUTPUT;
  END;

  *
    Possible end of sentence: word text ends in period,
    exclamation point, or question mark, possibly followed
    by quotation marks.
    Skipped until the first word has been written (SN is still 0).
  *;
  IF SN > 0 THEN DO;
    * Blank out quotation marks so LENGTH finds the last other character. ;
    ENDCHARI = LENGTH(TRANSLATE(WORDTEXT, ' ', '"'''));
    ENDCHAR = SUBSTR(WORDTEXT, ENDCHARI, 1);
    SENTENCEEND = ENDCHAR IN ('.', '!', '?');
  END;
RUN;
* One observation per sentence. The last word of each sentence carries
  the word count of that sentence in SN, which is stored as SLENGTH. ;
DATA WORK.SENTENCE;
  SET WORK.WORD;
  BY SENTENCE;
  IF LAST.SENTENCE THEN OUTPUT;
  KEEP SENTENCE SN;
  RENAME SN = SLENGTH;
RUN;
* Mean word length over all words. The observation count that the
  procedure stores in _FREQ_ becomes WCOUNT. ;
PROC MEANS DATA=WORK.WORD NOPRINT;
  VAR LENGTH;
  OUTPUT OUT=WORK.WSUM (RENAME=(_FREQ_=WCOUNT))
         MEAN(LENGTH)=LENGTH;
RUN;
* Mean sentence length in words. The observation count that the
  procedure stores in _FREQ_ becomes SCOUNT. ;
PROC MEANS DATA=WORK.SENTENCE NOPRINT;
  VAR SLENGTH;
  OUTPUT OUT=WORK.SSUM (RENAME=(_FREQ_=SCOUNT))
         MEAN(SLENGTH)=SLENGTH;
RUN;
* Print the word and sentence summary figures under the report title. ;
TITLE1 'Thoreau Paragraph';
DATA _NULL_;
  * Each summary data set is read with its own SET statement so the
    values from both end up side by side in one iteration. ;
  SET WORK.WSUM;
  SET WORK.SSUM;
  FILE PRINT;
  * A blank line first, then one PUT statement per report line. ;
  PUT;
  PUT 'Word count: ' WCOUNT : COMMA9.;
  PUT 'Average word length: ' LENGTH : 5.2;
  PUT 'Sentence count: ' SCOUNT : COMMA7.;
  PUT 'Words per sentence: ' SLENGTH : 6.2;
RUN;
* Frequency of each distinct WORD, ordered by frequency. With no VAR
  statement PROC SUMMARY only counts observations. The count in _FREQ_
  is stored as N, and the output also has an overall row with _TYPE_ 0. ;
PROC SUMMARY DATA=WORK.WORD;
  CLASS WORD / ORDER=FREQ;
  OUTPUT OUT=WORK.WORDLIST1 (RENAME=(_FREQ_=N));
RUN;
* Express the frequency of each word as a percent of the total word count. ;
DATA WORDLIST;
  KEEP WORD N PERCENT;
  * The overall row is read once. Variables read by SET are retained,
    so WORDCOUNT stays available for every later observation. ;
  IF _N_ EQ 1 THEN DO;
    SET WORK.WORDLIST1 (RENAME=(N=WORDCOUNT) WHERE=(_TYPE_ EQ 0));
  END;
  * The per-word rows. ;
  SET WORK.WORDLIST1 (WHERE=(_TYPE_ GT 0));
  PERCENT = N/WORDCOUNT*100;
RUN;
* List the words that make up at least 1 percent of all words. ;
PROC PRINT DATA=WORDLIST NOOBS HEADING=HORIZONTAL;
  WHERE PERCENT >= 1;
  VAR WORD N PERCENT;
  FORMAT N COMMA6.
         PERCENT 7.3;
RUN;