20061015

File Character Frequency

In my machine learning course, our first project was to write a program to sort e-mail into nine class labels A through I. In my implementation after every e-mail was classified, I would write a single character to the screen. There were several thousand of e-mails that needed sorting, so I needed a program that displayed a final tally of the results.

I could have easily built something into the program I was writing, but I thought something more general purpose would be useful. I quickly wrote the following code. It takes a tally of each character in a file and then uses a quick sort to sort the data and then prints the results. I even added a switch to ignore whitespace characters.

#include <stdio.h>

/* Author:      James Church
 * Date:        09/11/06
 * Program:     charfreq
 * Version:     0.3
 * Description: Takes a single file from the command line as input and
 *              reports the character frequence of each character in order
 *              from most common to least common.
 *
 * Copyright © 2006 Free Software Foundation, Inc.
 *
 * Copying and distribution of this file, with or without modification,
 * are permitted in any medium without royalty provided the copyright
 * notice and this notice are preserved.
 */

#define CHARRANGE 256

typedef struct _charcount {
    unsigned char c;
    long          count;
} CharCount;

void quicksort (CharCount *a, int i, int j);
int  partition (CharCount *a, int i, int j);

int main(int argc, char **argv) {
    int            i;
    unsigned char  c;
    CharCount      freq[CHARRANGE];
    FILE          *file;
    int           ignore_whitespace = 0;
    int           argparse = 1;

    if (argc == 1) {
        printf("\nUsage: %s [-s] [filename] - Reports character frequence of file.\n", argv[0]);
        printf(" -s  Ignores whitespace (optional)\n");
        return 0;
    } // End If

    if (strcmp("-s", argv[argparse]) == 0) {
        ignore_whitespace = 1;
        argparse++;
    } // End If

    if ((file = fopen(argv[argparse], "r")) == NULL) {
        printf("Error: Cannot open %s\n", argv[1]);
        return 0;
    } // End If
    argparse++;

    for (i = 0; i < CHARRANGE; i++) {
        freq[i].c     = (unsigned char) i;
        freq[i].count = 0;
    } // End For

    while (1) {
        fread (&c, sizeof(unsigned char), 1, file);
        if (feof(file)) break;

        if (ignore_whitespace && (c == ' ' || c == '\t' || c == '\r' || c == '\n'))
            continue;

        freq[c].count++;
    } // End While

    quicksort(freq, 0, CHARRANGE-1);

    for (i = CHARRANGE-1; i >= 0; i--) {
        if (freq[i].count == 0) break;

        printf("%c[%3d]: %d\n", freq[i].c, freq[i].c, freq[i].count);
    } // End For

    fclose(file);
    return 0;
} // End Main

void quicksort (CharCount *a, int i, int j) {
    int p;

    if (i < j) {
        p = partition (a, i, j);
        quicksort (a, i, p-1);
        quicksort (a, p+1, j);
    } /* End If */
} /* End mergesort */

int partition (CharCount *a, int i, int j) {
    int val = a[i].count;
    int h   = i;
    int k;
    CharCount temp;

    for (k = i+1; k <= j; k++)
        if (a[k].count < val) {
            h++;
            temp = a[h];
            a[h] = a[k];
            a[k] = temp;
        } /* End If */

    temp = a[i];
    a[i] = a[h];
    a[h] = temp;
    return h;
} /* End partition */

Enjoy!

No comments: