如果单词存在于某个单词数组中,则将其排除



给定下面这段统计所有单词出现次数的代码,如何排除常用单词?

例如,如果某个单词属于最常见的前 100 个英语单词,则不对其计数。

如果按照 Wikipedia 的列表取出最常见的 100 个单词,如何将它们放入数组,并在计数时检查并跳过其中的单词?列表见:https://en.wikipedia.org/wiki/Most_common_words_in_English

数组形式的前 100 个最常见单词:

#define NUMBER_OF_STRING 100
#define MAX_STRING_SIZE   50
/*
 * Stop list: the 100 most common English words.
 *
 * Every entry is lowercase. Candidate words are lowercased before they are
 * compared with strcmp(), which is case-sensitive, so a capitalized entry
 * (the pronoun was originally stored as "I") could never match.
 */
char commonWords[NUMBER_OF_STRING][MAX_STRING_SIZE] = {
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
    "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
    "than", "then", "now", "look", "only", "come", "its", "over", "think", "also",
    "back", "after", "use", "two", "how", "our", "work", "first", "well", "way",
    "even", "new", "want", "because", "any", "these", "give", "day", "most", "us"
};

代码示例:

/**
 * C program to count occurrences of all words in a file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#define MAX_WORD  20000     /* size of every word buffer, in bytes */
#define MAX_WORDS     8     /* initial number of words_t structs to allocate */
#ifndef PATH_MAX
#define PATH_MAX   2048     /* fallback max path when limits.h does not define it */
#endif
/* One entry per distinct word found in the input file. */
typedef struct {            /* use a struct to hold */
    char word[MAX_WORD];    /* the word, stored in lowercase */
    int cap, count;         /* cap: nonzero once seen capitalized; count: occurrences */
} words_t;
/*
 * Convert the NUL-terminated string str to lowercase in place.
 *
 * Returns str so the call can be used inside an expression; returns NULL
 * (and does nothing) when str is NULL.
 *
 * Each character goes through unsigned char before it reaches tolower():
 * the <ctype.h> functions are only defined for EOF and values representable
 * as unsigned char, so a negative plain char would be undefined behaviour.
 */
char *strlwr (char *str)
{
    if (!str)
        return NULL;
    for (char *p = str; *p; p++)
        *p = (char) tolower ((unsigned char) *p);
    return str;
}
/*
 * Read a file path from stdin, count every distinct word in that file
 * (compared in lowercase, trailing punctuation removed) and print the count
 * of each word that appeared capitalized at least once.
 *
 * Returns 0 on success and 1 when no path could be read; exits with
 * EXIT_FAILURE when the initial allocation or the file open fails.
 */
int main (void) {
    FILE *fptr;
    char path[PATH_MAX], word[MAX_WORD];
    char fmt[32];               /* run-time "%<width>s" conversion */
    size_t i, len, index = 0, max_words = MAX_WORDS;
    /* pointer to allocated block of max_words struct initialized zero */
    words_t *words = calloc (max_words, sizeof *words);
    if (!words) {   /* validate every allocation */
        perror ("calloc-words");
        exit (EXIT_FAILURE);
    }
    /* Input file path; the field width keeps scanf inside path[] */
    printf ("Enter file path: ");
    snprintf (fmt, sizeof fmt, "%%%zus", sizeof path - 1);
    if (scanf (fmt, path) != 1) {   /* validate every input */
        fputs ("error: invalid file path or cancellation.\n", stderr);
        free (words);
        return 1;
    }
    fptr = fopen (path, "r");   /* open file */
    if (fptr == NULL) {         /* validate file open */
        fputs ( "Unable to open file.\n"
                "Please check you have read privileges.\n", stderr);
        free (words);
        exit (EXIT_FAILURE);
    }
    /* bound the word reads as well: at most sizeof word - 1 characters */
    snprintf (fmt, sizeof fmt, "%%%zus", sizeof word - 1);
    while (fscanf (fptr, fmt, word) == 1) {   /* while valid word read */
        /* <ctype.h> functions need unsigned char values, never a negative char */
        int iscap = isupper ((unsigned char) *word) != 0;
        int isunique = 1;
        /* remove all trailing punctuation characters */
        len = strlen (word);                    /* get length */
        while (len && ispunct ((unsigned char) word[len - 1]))
            word[--len] = 0;
        if (len == 0)       /* token was punctuation only; it is never capitalized, so never printed */
            continue;
        strlwr (word);                  /* convert word to lowercase */
        /* check if word exists in list of all distinct words */
        for (i = 0; i < index; i++) {
            if (strcmp (words[i].word, word) == 0) {
                isunique = 0;               /* already known */
                if (iscap)                  /* remember any capitalized use */
                    words[i].cap = iscap;
                words[i].count++;           /* increment word count */
                break;                      /* bail - done */
            }
        }
        if (isunique) { /* if unique, add to array, increment index */
            if (index == max_words) {       /* is realloc needed? */
                /* always use a temporary pointer with realloc */
                void *tmp = realloc (words, 2 * max_words * sizeof *words);
                if (!tmp) { /* validate every allocation */
                    perror ("realloc-words");
                    break;  /* don't exit, original data still valid */
                }
                words = tmp;    /* assign reallocated block to words */
                /* zero the new half so cap and count start at 0 */
                memset (words + max_words, 0, max_words * sizeof *words);
                max_words *= 2; /* update max_words to reflect new limit */
            }
            memcpy (words[index].word, word, len + 1);  /* have len */
            if (iscap)                      /* if cap flag set */
                words[index].cap = iscap;   /* set capital flag in struct */
            words[index++].count++;         /* increment count & index */
        }
    }
    fclose (fptr);  /* close file */
    /*
     * Print occurrences of all words that appeared capitalized.
     */
    puts ("\nOccurrences of all distinct words with Cap in file:");
    for (i = 0; i < index; i++) {
        if (words[i].cap) {
            strcpy (word, words[i].word);
            *word = (char) toupper ((unsigned char) *word);
            /*
             * %-8d prints the count left-aligned in an 8 character
             * wide field, followed by the re-capitalized word.
             */
            printf ("%-8d %s\n", words[i].count, word);
        }
    }
    free (words);
    return 0;
}

用于测试的文本文件(cars.txt):

A car (or automobile) is a wheeled motor vehicle used for transportation. Most definitions of car say they run primarily on roads, seat one to eight people, have four tires, and mainly transport people rather than goods.[2][3]
Cars came into global use during the 20th century, and developed economies depend on them. The year 1886 is regarded as the birth year of the modern car when German inventor Karl Benz patented his Benz Patent-Motorwagen. Cars became widely available in the early 20th century. One of the first cars accessible to the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company. Cars were rapidly adopted in the US, where they replaced animal-drawn carriages and carts, but took much longer to be accepted in Western Europe and other parts of the world.
Cars have controls for driving, parking, passenger comfort, and a variety of lights. Over the decades, additional features and controls have been added to vehicles, making them progressively more complex. These include rear reversing cameras, air conditioning, navigation systems, and in-car entertainment. Most cars in use in the 2010s are propelled by an internal combustion engine, fueled by the combustion of fossil fuels. Electric cars, which were invented early in the history of the car, began to become commercially available in 2008.
There are costs and benefits to car use. The costs include acquiring the vehicle, interest payments (if the car is financed), repairs and maintenance, fuel, depreciation, driving time, parking fees, taxes, and insurance.[4] The costs to society include maintaining roads, land use, road congestion, air pollution, public health, health care, and disposing of the vehicle at the end of its life. Road traffic accidents are the largest cause of injury-related deaths worldwide.[5]
The benefits include on-demand transportation, mobility, independence, and convenience.[6] The societal benefits include economic benefits, such as job and wealth creation from the automotive industry, transportation provision, societal well-being from leisure and travel opportunities, and revenue generation from the taxes. People's ability to move flexibly from place to place has far-reaching implications for the nature of societies.[7] There are around 1 billion cars in use worldwide. The numbers are increasing rapidly, especially in China, India and other newly industrialized countries.[8]

当前输出:

Occurrences of all distinct words with Cap in file:
3        A
2        Motor
2        Most
2        One
8        Cars
29       The
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company
1        Us
1        Western
1        Europe
1        Over
1        These
1        Electric
2        There
2        Road
1        People's
1        China
1        India

预期输出(仅为示例):

2        Motor
1        German
1        Karl
2        Benz
1        Patent-motorwagen
1        Model
1        T
1        American
1        Ford
1        Company

编辑更新:可能的解决方案:

  • 使用 continue(不起作用):
    // skip the word if it is a common word
    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            continue;
        }
    }
    
  • 一种稍微更高效的方法是只调用一次 strstr,而不是把单词与前 100 个最常见单词逐一比较。由于这 100 个最常见的单词是已知且不会变化的,很容易确定其中最长的单词有 7 个字符。换句话说,只有当 word 的长度小于下面这个值时,才需要检测它是否是最常见的单词之一:

    #define TOP_LEN       8     /* longest string in TOP100 + nul-character */
    

    由于这些单词不会变化,您可以直接把它们定义成一个常量字符串:

    /*
     * The 100 most common English words as one lowercase string.
     * Every word has a single space on both sides, so a word can be looked
     * up by searching for " word " with strstr().
     */
    const char TOP100[] = " the be to of and a in that have i it for not on with"
                    " he as you do at this but his by from they we say her she or"
                    " an will my one all would there their what so up out if about"
                    " who get which go me when make can like time no just him know"
                    " take people into year your good some could them see other"
                    " than then now look only come its over think also back after"
                    " use two how our work first well way even new want because"
                    " any these give day most us ";
    

    (注意:每个单词之前和之后都有一个空格,这样您就可以在 word 的两侧各加一个空格来构造 teststr,再用 strstr 进行搜索。"I" 已改为小写,以便在 strlwr (word); 之后仍能匹配。)

    (另请注意:您也可以用 #define TOP100 " the ... us " 的形式使用字符串常量,但那样在这里会换行并严重超出页面宽度——由您决定。)

    使用您的100个最常见单词的恒定字符串,唯一需要的添加是:

            ...
            strlwr (word);                  /* convert word to lowercase */
            /* check against 100 most common words (TOP100) */
            if (len < TOP_LEN) {                    /* word less than TOP_LEN? */
                char teststr[TOP_LEN * 2];          /* buffer for " word " */
                sprintf (teststr, " %s ", word);    /* create teststr */
                if (strstr (TOP100, teststr))       /* check if in TOP100 */
                    continue;                       /* if so, get next word */
            }
            ...
    

    如上所示,您先检查该单词是否不超过 7 个字符(否则无需与最常见的单词比较)。然后声明 teststr,用来保存两端各带一个空格的字符串。(最长的常见单词为 7 个字符,加上 2 个空格是 9 个字符,再加上 nul 终止符是 10 个字符,因此 16 个字符的缓冲区在这里绰绰有余。)

    sprintf进行的简单调用是将空格放在word的每一端所需的全部,然后需要对strstr进行单个调用,以查看word是否在前100个最常见的单词中。如果是这样,则无需走得更远,只需 continue即可获取下一个单词。

    将其完全放入您的代码中:

    /**
     * C program to count occurrences of all words in a file.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    #include <limits.h>
    #define MAX_WORD  20000     /* size of every word buffer, in bytes */
    #define MAX_WORDS     8     /* initial number of words_t structs to allocate */
    #define TOP_LEN       8     /* longest word in TOP100 (7 chars) + 1 */
    #ifndef PATH_MAX
    #define PATH_MAX   2048     /* fallback max path when limits.h does not define it */
    #endif
    /*
     * The 100 most common English words as one lowercase string.
     * Every word has a single space on both sides, so a word can be looked
     * up by searching for " word " with strstr().
     */
    const char TOP100[] = " the be to of and a in that have i it for not on with"
                    " he as you do at this but his by from they we say her she or"
                    " an will my one all would there their what so up out if about"
                    " who get which go me when make can like time no just him know"
                    " take people into year your good some could them see other"
                    " than then now look only come its over think also back after"
                    " use two how our work first well way even new want because"
                    " any these give day most us ";
    /* One entry per distinct word found in the input file. */
    typedef struct {            /* use a struct to hold */
        char word[MAX_WORD];    /* the word, stored in lowercase */
        int cap, count;         /* cap: nonzero once seen capitalized; count: occurrences */
    } words_t;
    /*
     * Convert the NUL-terminated string str to lowercase in place.
     *
     * Returns str so the call can be used inside an expression; returns NULL
     * (and does nothing) when str is NULL.
     *
     * Each character goes through unsigned char before it reaches tolower():
     * the <ctype.h> functions are only defined for EOF and values
     * representable as unsigned char, so a negative plain char would be
     * undefined behaviour.
     */
    char *strlwr (char *str)
    {
        if (!str)
            return NULL;
        for (char *p = str; *p; p++)
            *p = (char) tolower ((unsigned char) *p);
        return str;
    }
    /*
     * Read a file path from stdin, count every distinct word in that file
     * that is not one of the TOP100 common words (compared in lowercase,
     * trailing punctuation removed) and print the count of each word that
     * appeared capitalized at least once.
     *
     * Returns 0 on success and 1 when no path could be read; exits with
     * EXIT_FAILURE when the initial allocation or the file open fails.
     */
    int main (void) {
        FILE *fptr;
        char path[PATH_MAX], word[MAX_WORD];
        char fmt[32];               /* run-time "%<width>s" conversion */
        size_t i, len, index = 0, max_words = MAX_WORDS;
        /* pointer to allocated block of max_words struct initialized zero */
        words_t *words = calloc (max_words, sizeof *words);
        if (!words) {   /* validate every allocation */
            perror ("calloc-words");
            exit (EXIT_FAILURE);
        }
        /* Input file path; the field width keeps scanf inside path[] */
        printf ("Enter file path: ");
        snprintf (fmt, sizeof fmt, "%%%zus", sizeof path - 1);
        if (scanf (fmt, path) != 1) {   /* validate every input */
            fputs ("error: invalid file path or cancellation.\n", stderr);
            free (words);
            return 1;
        }
        fptr = fopen (path, "r");   /* open file */
        if (fptr == NULL) {         /* validate file open */
            fputs ( "Unable to open file.\n"
                    "Please check you have read privileges.\n", stderr);
            free (words);
            exit (EXIT_FAILURE);
        }
        /* bound the word reads as well: at most sizeof word - 1 characters */
        snprintf (fmt, sizeof fmt, "%%%zus", sizeof word - 1);
        while (fscanf (fptr, fmt, word) == 1) {   /* while valid word read */
            /* <ctype.h> functions need unsigned char values, never a negative char */
            int iscap = isupper ((unsigned char) *word) != 0;
            int isunique = 1;
            /* remove all trailing punctuation characters */
            len = strlen (word);                    /* get length */
            while (len && ispunct ((unsigned char) word[len - 1]))
                word[--len] = 0;
            if (len == 0)       /* token was punctuation only; it is never capitalized, so never printed */
                continue;
            strlwr (word);                  /* convert word to lowercase */
            /* check against 100 most common words (TOP100) */
            if (len < TOP_LEN) {                    /* word less than TOP_LEN? */
                /* len < TOP_LEN, so " word " plus the nul always fits */
                char teststr[TOP_LEN * 2];
                sprintf (teststr, " %s ", word);    /* create teststr */
                if (strstr (TOP100, teststr))       /* check if in TOP100 */
                    continue;                       /* if so, get next word */
            }
            /* check if word exists in list of all distinct words */
            for (i = 0; i < index; i++) {
                if (strcmp (words[i].word, word) == 0) {
                    isunique = 0;               /* already known */
                    if (iscap)                  /* remember any capitalized use */
                        words[i].cap = iscap;
                    words[i].count++;           /* increment word count */
                    break;                      /* bail - done */
                }
            }
            if (isunique) { /* if unique, add to array, increment index */
                if (index == max_words) {       /* is realloc needed? */
                    /* always use a temporary pointer with realloc */
                    void *tmp = realloc (words, 2 * max_words * sizeof *words);
                    if (!tmp) { /* validate every allocation */
                        perror ("realloc-words");
                        break;  /* don't exit, original data still valid */
                    }
                    words = tmp;    /* assign reallocated block to words */
                    /* zero the new half so cap and count start at 0 */
                    memset (words + max_words, 0, max_words * sizeof *words);
                    max_words *= 2; /* update max_words to reflect new limit */
                }
                memcpy (words[index].word, word, len + 1);  /* have len */
                if (iscap)                      /* if cap flag set */
                    words[index].cap = iscap;   /* set capital flag in struct */
                words[index++].count++;         /* increment count & index */
            }
        }
        fclose (fptr);  /* close file */
        /*
         * Print occurrences of all words that appeared capitalized.
         */
        puts ("\nOccurrences of all distinct words with Cap in file:");
        for (i = 0; i < index; i++) {
            if (words[i].cap) {
                strcpy (word, words[i].word);
                *word = (char) toupper ((unsigned char) *word);
                /*
                 * %-8d prints the count left-aligned in an 8 character
                 * wide field, followed by the re-capitalized word.
                 */
                printf ("%-8d %s\n", words[i].count, word);
            }
        }
        free (words);
        return 0;
    }
    

    示例使用/输出

    和上次一样,您的预期输出(仅为示例)是错误的,因为您的代码中没有任何逻辑去除复数、所有格或复数所有格形式,因此使用 cars.txt 文件得到的输出将是:

    $ ./bin/unique_words_exclude_top_100
    Enter file path: dat/cars.txt
    Occurrences of all distinct words with Cap in file:
    2        Motor
    8        Cars
    1        German
    1        Karl
    2        Benz
    1        Patent-motorwagen
    1        Model
    1        T
    1        American
    1        Ford
    1        Company
    1        Western
    1        Europe
    1        Electric
    2        Road
    1        People's
    1        China
    1        India
    

    查看事物,让我知道您是否还有其他问题。

    在将 word 添加到 words 列表之前,先过滤掉常用单词。我编写了如下的过滤函数:

    /*
     * Report whether word is one of the entries of commonWords.
     * The comparison is an exact, case-sensitive strcmp() match.
     * Returns 1 when word is found and 0 otherwise.
     */
    int isCommonWord(char * word)
    {
        int found = 0;
        int k = 0;

        /* linear scan that stops at the first match */
        while (!found && k < NUMBER_OF_STRING) {
            if (!strcmp(commonWords[k], word))
                found = 1;
            k++;
        }
        return found;
    }
    

    然后在把单词添加到 words 数组之前将其过滤掉。请看下面代码的第二行,这就是我所做的修改:

    if (isunique) { /* if unique, add to array, increment index */
        if (!isCommonWord(word)) {
            if (index == max_words) {       /* is realloc needed? */
                /* always use a temporary pointer with realloc */
                void *tmp = realloc(words, 2 * max_words * sizeof *words);
                if (!tmp) { /* validate every allocation */
                    perror("realloc-words");
                    break;  /* don't exit, original data still valid */
                }
                words = (words_t *)tmp;    /* assign reallocated block to words */
                /* (optional) set all new memory to zero */
                memset(words + max_words, 0, max_words * sizeof *words);
                max_words *= 2; /* update max_words to reflect new limit */
            }
            memcpy(words[index].word, word, len + 1);  /* have len */
            if (iscap)                      /* if cap flag set */
                words[index].cap = iscap;   /* set capital flag in struct */
            words[index++].count++;         /* increment count & index */
        }
    }
    

    我认为结果是正确的:

    Enter file path: cars.txt
    Occurrences of all distinct words with Cap in file:
    2        Motor
    8        Cars
    1        German
    1        Karl
    2        Benz
    1        Patent-motorwagen
    1        Model
    1        T
    1        American
    1        Ford
    1        Company
    1        Western
    1        Europe
    1        Electric
    2        Road
    1        People's
    1        China
    1        India
    

    这显然行不通:它并没有像那条有误导性的注释所说的那样在遇到常用单词时跳过该单词,而只是跳过 for 循环的当前迭代,然后继续与"常用单词列表"中的下一个单词进行比较:

    // skip the word if it is a common word
    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            continue;
        }
    }
    

    continue 只影响最内层的循环。此外,循环结束之后没有任何状态发生改变。

    要修复这个问题,您需要跳过外层循环的当前迭代:

    /* Jumping back to the label re-evaluates the while condition, i.e. reads the next word. */
    nextword:
    while (fscanf (fptr, "%s", word) == 1) { // read the word
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                goto nextword; // skip current word
            }
        }
    /// ...
    }
    

    或如果您不想使用goto,则必须使用另一个变量

    while (fscanf (fptr, "%s", word) == 1) { // read the word
        /* must be reset for every word, otherwise the first common word
           would cause every following word to be skipped as well */
        int isCommonWord = 0;
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                isCommonWord = 1;
                break; // exit the for loop
            }
        }
        if (isCommonWord)
            continue;  // get the next word
    /// ...
    }
    

    无论如何,您的实现效率很低。它本质上是一个字典,把字符串(单词)映射到整数(单词的出现次数)。字典可以是有序的(例如 C++ 中的 std::map),也可以是基于哈希的(C++ 中的 std::unordered_map)。由于您没有对数组排序,每次查找都必须遍历整个列表。如果数组已排序,使用二分查找可以大幅减少查找次数:在 128 个元素的列表中查找,最多只需要 7 次比较,而不是 128 次。

    但是,在字典中查找单词之前,您需要先检查该单词是否是常用单词。这通过检查单词是否存在于常用单词集合中来完成。同样,这个集合可以用无序方式(慢)、有序方式(更好,如 C++ 中的 std::set)或基于哈希的方式(最快但需要更多内存,如 C++ 中的 std::unordered_set)实现。集合与字典的区别在于:字典的每个条目是一对(键,值),而集合中值本身就是键。上面用 for 循环检查 strcmp(word, commonWords[i])==0 就是一种简单的线性遍历。无论采用哪种方式,一旦在集合中找到该单词,就按上文所述跳过 while 循环的当前迭代(而不是 for 循环的迭代),这样就可以正常工作。

    继续外层循环正是推荐使用 goto 的少数情况之一。

    在此之前添加标签:

    outer:
    while (fscanf (fptr, "%s", word) == 1)  { ... }
    

    并将问题中的可能解决方案更改为:

    for (int i = 0; i < NUMBER_OF_STRING; i++) {
        if (strcmp(word, commonWords[i])==0) {
            goto outer;
        }
    }
    

    在您当前的解决方案中,continue只是继续内部循环。

    编辑


    基于您的程序,修改程序应如下工作:

    .
    .
    .
    outer:
    while (fscanf (fptr, "%s", word) == 1) {
        .
        .
        .
        strlwr(word);
        for (int i = 0; i < NUMBER_OF_STRING; i++) {
            if (strcmp(word, commonWords[i])==0) {
                goto outer;
            }
        }
        .
        .
        .
    }
    .
    .
    .
    

    一个功能看起来像:

    /*
     * Return 1 when word exactly matches (case-sensitive strcmp) one of the
     * entries of commonWords, and 0 when no entry matches.
     */
    int isCommon(char *word) {
        int idx;

        /* leave the loop early at the first matching entry */
        for (idx = 0; idx < NUMBER_OF_STRING; ++idx)
            if (!strcmp(word, commonWords[idx]))
                break;
        return idx < NUMBER_OF_STRING ? 1 : 0;
    }
    int main() {
        .
        .
        .
        while (fscanf (fptr, "%s", word) == 1) {
            .
            .
            .
            strlwr(word);
            if(isCommon(word))
                continue;
            .
            .
            .
        }
        .
        .
        .
    }
    

    请注意,如果使用此功能,您将不再需要goto;一个简单的continue就足够了。

    相关内容

    • 没有找到相关文章

    最新更新