Online File

How to use this page


Rick Aster: Professional SAS Programming Shortcuts: Contents

Chapter 82
Program
Text analysis


/*
   WORK.WORD: DATA step view that reads the text in fileref TEXT as
   blank-delimited tokens and returns one observation per word.

   Output variables:
      WORDTEXT  the token exactly as read (up to 24 characters)
      WORD      the token in uppercase with most punctuation removed
      LENGTH    length of WORD, not counting apostrophes
      SENTENCE  sentence number, starting at 1
      SN        position of the word within its sentence, starting at 1

   A new sentence starts at a word whose first letter is uppercase when
   the preceding token ended in a period, exclamation point, or question
   mark (optionally followed by quotation marks).
*/
DATA WORK.WORD (KEEP=WORD WORDTEXT LENGTH SENTENCE SN)
    / VIEW=WORK.WORD;
   LENGTH WORDTEXT WORD $ 24 ENDCHAR LETTER $ 1;
   RETAIN SENTENCEEND 0 SENTENCE 1;
   INFILE TEXT FLOWOVER;
   * Trailing double at-sign: keep reading tokens from the same record. ;
   INPUT WORDTEXT @@;

   *
     Word: convert word text to uppercase
     and remove most punctuation.
     Apostrophes are kept so that contractions stay intact.
   *;
   WORD = UPCASE(COMPRESS(WORDTEXT,
       '!"#$%&()*+,-./:;<=>?@[\]^`{|}~'));
   * Hyphens are already gone from WORD, so this drops apostrophes. ;
   LENGTH = LENGTH(COMPRESS(WORD, "'-"));

   IF WORD NE '' THEN DO;
      * Check for sentence break. ;
      IF SENTENCEEND THEN DO;
         *
           If first letter is uppercase, start new sentence.
           The test must use WORDTEXT, because WORD has already been
           converted to uppercase and would always pass.
         *;
         LI = ANYALPHA(WORDTEXT);
         IF LI THEN LETTER = SUBSTR(WORDTEXT, LI, 1);
         ELSE LETTER = ' ';
         IF ANYUPPER(LETTER) THEN DO;
            SENTENCE + 1;
            SN = 0;
            END;
         * Lowercase or no letter: the mark did not end the sentence. ;
         ELSE SENTENCEEND = 0;
         END;
      * SN is retained automatically because of the sum statement. ;
      SN + 1;
      OUTPUT;
      END;

   *
      Possible end of sentence: word text ends in period,
      exclamation point, or question mark, possibly followed
      by quotation marks.
   *;
   IF SN > 0 THEN DO;
      * Blank out quotation marks so LENGTH finds the last other character. ;
      ENDCHARI = LENGTH(TRANSLATE(WORDTEXT, '  ', '"'''));
      ENDCHAR = SUBSTR(WORDTEXT, ENDCHARI, 1);
      SENTENCEEND = ENDCHAR IN ('.', '!', '?');
      END;
RUN;

*
   One observation per sentence. SN on the last word of a sentence
   is the number of words in that sentence, stored here as SLENGTH.
*;
DATA WORK.SENTENCE (KEEP=SENTENCE SLENGTH);
   SET WORK.WORD (KEEP=SENTENCE SN RENAME=(SN=SLENGTH));
   BY SENTENCE;
   IF LAST.SENTENCE THEN OUTPUT;
RUN;

*
   Word totals: WCOUNT is the number of words and
   LENGTH is the mean word length.
*;
PROC MEANS DATA=WORK.WORD NOPRINT;
   VAR LENGTH;
   OUTPUT OUT=WORK.WSUM (RENAME=(_FREQ_=WCOUNT))
          MEAN=;
RUN;
*
   Sentence totals: SCOUNT is the number of sentences and
   SLENGTH is the mean number of words per sentence.
*;
PROC MEANS DATA=WORK.SENTENCE NOPRINT;
   VAR SLENGTH;
   OUTPUT OUT=WORK.SSUM (RENAME=(_FREQ_=SCOUNT))
          MEAN=;
RUN;
TITLE1 'Thoreau Paragraph';
*
   Print the word and sentence statistics. WORK.WSUM and WORK.SSUM
   each hold one summary observation, read here side by side.
*;
DATA _NULL_;
   FILE PRINT;
   SET WORK.WSUM;
   SET WORK.SSUM;
   * Blank line first, then one statistic per line. ;
   PUT;
   PUT 'Word count: ' WCOUNT : COMMA9.;
   PUT 'Average word length: ' LENGTH : 5.2;
   PUT 'Sentence count: ' SCOUNT : COMMA7.;
   PUT 'Words per sentence: ' SLENGTH : 6.2;
RUN;

*
   Frequency of each distinct word, most frequent first.
   The _TYPE_ = 0 observation holds the overall word count.
*;
PROC SUMMARY DATA=WORK.WORD;
   CLASS WORD / ORDER=FREQ;
   OUTPUT OUT=WORK.WORDLIST1
          (RENAME=(_FREQ_=N));
RUN;
*
   Word list with each word as a percent of all words.
   The overall count is read once, from the _TYPE_ = 0 observation,
   and stays in WORDCOUNT for every word observation that follows.
*;
DATA WORDLIST (KEEP=WORD N PERCENT);
   IF _N_ EQ 1 THEN DO;
      SET WORK.WORDLIST1
          (WHERE=(_TYPE_ EQ 0) RENAME=(N=WORDCOUNT));
      END;
   SET WORK.WORDLIST1 (WHERE=(_TYPE_ GT 0));
   PERCENT = N/WORDCOUNT*100;
RUN;
* List the words that make up at least 1 percent of the text. ;
PROC PRINT DATA=WORDLIST NOOBS HEADING=HORIZONTAL;
   WHERE PERCENT >= 1;
   VAR WORD N PERCENT;
   FORMAT N COMMA6.;
   FORMAT PERCENT F7.3;
RUN;

 O /\

Global
Statements

RICK ASTER

SAS

BOOKS

Tech | Dictionary

Download | Rastinate

Rick Aster

Professional SAS Programming Shortcuts

Contents/Online Files

Corrections

Catalog Page