我有以下程序,它从文本文件中读取单词并创建一个链表,每个节点包含:word,count,next。
当链表中已存在单词时,计数将更新,否则,将在链表的末尾创建一个节点。
我所有的函数都有效,除了我在链表末尾添加一个单词的函数。节点链接可能存在错误?
使用我的以下文本文件:第 1 行"我的名字是娜塔莉",第 2 行"我的狗是尼科" 我应该得到以下输出:my(2),name(1),is(2),natalie(1),dog(1),niko(1) 但我得到:我的(2),狗(2),s(1),iko(1),is(1),niko(1)
我的错误在哪里?
//function to add word to linked list
struct WordNode *addWord(char* word, struct WordNode *wordListHead){
//create new node
struct WordNode *current = malloc(sizeof(struct WordNode));
current->word = word;
current->count = 1;
current->next = NULL;
//
while(1){
if((wordListHead != NULL)&&(wordListHead->next ==NULL)){
//connect
wordListHead->next=current;
break;
}
wordListHead = wordListHead->next;
}
}
在主线中调用:
char *filename = argv[1];
FILE *fp = fopen(filename, "r");
printf("%sn", filename);
if (fp == NULL){
printf("Error: unable to open the file ");
printf("%s", filename);
return 1;
}
else {
char *delim = " tn,:;'".?!#$-><(){}[]|\/*&^%@!~+=_"; // These are our word delimiters.
char line[1000];
char * token;
int count = 0;//count used so that first word of the doc can be created as head node
//create head pointer
struct WordNode *wordListHead = NULL;
//create current pointer
struct WordNode *current = NULL;
//iterate through each line in file
while(fgets(line, 1000, fp)){
//seperate each word
//first word of line
token = strtok(line, delim);
printf("%sn", token);
if(count == 0){
//first word of document, create first wordnode (head node)
wordListHead = malloc(sizeof(struct WordNode));
wordListHead->word = token;
wordListHead->count = 1;
wordListHead->next = NULL;
}
else{
//check if first word of line exists in linked list
if((doesWordExist(token, wordListHead)) == 0){
//update count
updateCount(token, wordListHead);
}
else{
//create node
addWord(token, wordListHead);
}
}
//iterate through all the other words of line
while ((token=strtok(NULL, delim)) != NULL){
printf("%sn", token);
//check if name is in linked list
if((doesWordExist(token, wordListHead)) == 0){
//update count
updateCount(token, wordListHead);
}
else{
//create node
addWord(token, wordListHead);
}
}
count++;
}
printWordList(wordListHead);
}
}
此处定义的结构:
//structure definition of linked list node
#ifndef WORDLISTH
#define WORDLISTH
struct WordNode{
char *word;
unsigned long count;
struct WordNode *next;
};
void printWordList( struct WordNode *wordListHead);
struct WordNode *addWord(char* word , struct WordNode *wordListead);
#endif
其他供参考的功能:
//function to check if word is in linked list
bool doesWordExist(char* myword, struct WordNode *wordListHead){
while (wordListHead != NULL){
if(strcmp(wordListHead->word, myword) == 0){
return 0;
}
wordListHead= wordListHead-> next;
}
return 1;
}
//function to update the count of word
void updateCount(char* myword, struct WordNode *wordListHead){
while (wordListHead != NULL){
if(strcmp(wordListHead->word, myword) == 0){
//update count value
//capture current count and add 1
int curcount = wordListHead->count;
int newcount = curcount + 1;
wordListHead->count = newcount;
//printf("%dn", newcount);
}
wordListHead= wordListHead-> next;
}
}
//function to print word list
//takes head node as argument
void printWordList( struct WordNode *wordListHead){
//WordNode *toyptr = wordListHead;
while (wordListHead != NULL){
printf("%sn", wordListHead->word);
printf("%ldn", wordListHead->count);
wordListHead = wordListHead -> next;
}
}
将token
存储到结构中时,使用的是作为输入缓冲区一部分的指针。
在新的输入行上,从前几行收集的令牌将被损坏/丢弃。
您需要分配空间以将令牌存储在堆上的结构中。为此使用strdup
。
因此,在addWord
中,您需要:
current->word = strdup(word);
在main
中,您需要:
wordListHead->word = strdup(token);
>更新:这是主要问题。但是,您的代码会执行大量不必要的复制。
addWord
不会处理空列表。但是,如果是这样,main
就不需要为行上的第一个单词和后续单词使用单独的[复制]代码。
strcmp
可以合并到addWord
中,并且可以"做所有事情"。(即列表的单次扫描)
对于doesWordExist
,它返回一个匹配项的bool
。如果它返回指向匹配元素的指针,则updateCount
只需增加计数 [而不是重新扫描列表]。我已经相应地更新了这些功能,但由于对addWord
的更改,不再需要它们
以下是我简化和重构代码的方法:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef int bool;
#ifdef DEBUG
#define dbgprt(_fmt...)
printf(_fmt)
#else
#define dbgprt(_fmt...) do { } while (0)
#endif
//structure definition of linked list node
#ifndef WORDLISTH
#define WORDLISTH
struct WordNode {
char *word;
unsigned long count;
struct WordNode *next;
};
void printWordList(struct WordNode *wordListHead);
#endif
//function to add word to linked list
struct WordNode *
addWord(char *word, struct WordNode **list)
{
struct WordNode *curr;
struct WordNode *prev = NULL;
struct WordNode *newnode = NULL;
for (curr = *list; curr != NULL; curr = curr->next) {
if (strcmp(curr->word,word) == 0) {
newnode = curr;
break;
}
prev = curr;
}
// create new node
do {
// word already exists
if (newnode != NULL)
break;
newnode = malloc(sizeof(struct WordNode));
newnode->word = strdup(word);
newnode->count = 0;
newnode->next = NULL;
// attach to tail of list
if (prev != NULL) {
prev->next = newnode;
break;
}
// first node -- set list pointer
*list = newnode;
} while (0);
// increment the count
newnode->count += 1;
return newnode;
}
//function to check if word is in linked list
struct WordNode *
findWord(char *myword, struct WordNode *head)
{
struct WordNode *curr;
for (curr = head; curr != NULL; curr = curr->next) {
if (strcmp(curr->word,myword) == 0)
break;
}
return curr;
}
//function to update the count of word
void
updateCount(char *myword, struct WordNode *head)
{
struct WordNode *match;
match = findWord(myword,head);
if (match != NULL)
match->count += 1;
}
//function to print word list
//takes head node as argument
void
printWordList(struct WordNode *head)
{
struct WordNode *curr;
for (curr = head; curr != NULL; curr = curr->next) {
printf("%s", curr->word);
printf(" %ldn", curr->count);
}
}
int
main(int argc, char **argv)
{
char *filename = argv[1];
FILE *fp = fopen(filename, "r");
printf("FILE: %sn", filename);
if (fp == NULL) {
printf("Error: unable to open the file ");
printf("%s", filename);
return 1;
}
// These are our word delimiters.
char *delim = " tn,:;'".?!#$-><(){}[]|\/*&^%@!~+=_";
char line[1000];
char *token;
// create head pointer
struct WordNode *wordListHead = NULL;
// iterate through each line in file
while (fgets(line, sizeof(line), fp)) {
// seperate each word
// first word of line
char *bp = line;
while (1) {
token = strtok(bp, delim);
bp = NULL;
if (token == NULL)
break;
dbgprt("TOKEN1: %sn", token);
addWord(token,&wordListHead);
}
}
printWordList(wordListHead);
return 0;
}
<小时 />更新#2:
请注意,addWord
和findWord
复制代码。addWord
的第一部分[基本上]是复制findWord
所做的事情。
但是,addWord
不能只使用findWord
[这是可取的],因为findWord
,如果它找不到匹配返回NULL
。在这种情况下,它[没有办法]传回addWord
需要附加到的最后一个元素(即列表的"尾巴")。
虽然我们可以添加一个额外的参数来findWord
传播这个值,但更好的解决方案是创建一个定义"列表"的不同结构。
在现有代码中,我们使用指向头字节点的"双星"指针作为"列表"。使用单独的结构更干净,并且具有一些额外的优点。
我们可以传递一个简单的指向列表的指针。我们不再需要担心我们是否应该传递双星指针。
虽然我们只使用单链表,但如果我们希望将列表转换为双向链表,单独的列表结构会有所帮助。
我们只需传递一个指向列表的指针,列表就可以跟踪列表的头部、列表的尾部以及列表中元素数量的计数。
链表非常适合与mergesort
一起排序。列表计数有助于提高效率,因为更容易找到列表的"中点"[合并排序需要知道]。
为了显示双向链表的开头,我在单词 struct 中添加了一个prev
元素。目前未使用,但它暗示了双重链接版本。
我重新设计了所有函数,以获取指向列表的指针,而不是指向头节点的指针。
由于列表结构具有tail
指针,因此addWord
现在可以调用findWord
。如果findWord
找不到匹配项,addWord
只需使用列表中的head/tail
指针即可找到正确的插入点。
为了简化一点,我更改了单词节点 [和列表结构] 以使用一些typedef
语句。
此外,将主导结构指针作为第一个参数更常见/惯用,因此我颠倒了某些函数的参数顺序。
无论如何,这是[进一步]重构的代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef int bool;
#ifdef DEBUG
#define dbgprt(_fmt...)
printf(_fmt)
#else
#define dbgprt(_fmt...) do { } while (0)
#endif
//structure definition of linked list node
#ifndef WORDLISTH
#define WORDLISTH
// word frequency control
typedef struct WordNode Word_t;
struct WordNode {
const char *word;
unsigned long count;
Word_t *next;
Word_t *prev;
};
// word list control
typedef struct {
Word_t *head;
Word_t *tail;
unsigned long count;
} List_t;
void printWordList(List_t *list);
#endif
// create a list
List_t *
newList(void)
{
List_t *list;
list = calloc(1,sizeof(*list));
return list;
}
// function to check if word is in linked list
Word_t *
findWord(List_t *list,const char *myword)
{
Word_t *curr;
for (curr = list->head; curr != NULL; curr = curr->next) {
if (strcmp(curr->word,myword) == 0)
break;
}
return curr;
}
//function to add word to linked list
Word_t *
addWord(List_t *list,const char *word)
{
Word_t *match;
do {
// try to find existing word
match = findWord(list,word);
// word already exists
if (match != NULL)
break;
// create new node
match = malloc(sizeof(*match));
match->word = strdup(word);
match->count = 0;
match->next = NULL;
// attach to head of list
if (list->head == NULL)
list->head = match;
// append to tail of list
else
list->tail->next = match;
// set new tail of list
list->tail = match;
// advance list count
list->count += 1;
} while (0);
// increment the word frequency count
match->count += 1;
return match;
}
//function to update the count of word
void
updateCount(List_t *list,const char *myword)
{
Word_t *match;
match = findWord(list,myword);
if (match != NULL)
match->count += 1;
}
//function to print word list
//takes head node as argument
void
printWordList(List_t *list)
{
Word_t *curr;
for (curr = list->head; curr != NULL; curr = curr->next) {
printf("%s", curr->word);
printf(" %ldn", curr->count);
}
}
int
main(int argc, char **argv)
{
char *filename = argv[1];
FILE *fp = fopen(filename, "r");
printf("FILE: %sn", filename);
if (fp == NULL) {
printf("Error: unable to open the file ");
printf("%s", filename);
return 1;
}
// These are our word delimiters.
char *delim = " tn,:;'".?!#$-><(){}[]|\/*&^%@!~+=_";
char line[1000];
char *token;
// create list/head pointer
List_t *list = newList();
// iterate through each line in file
while (fgets(line, sizeof(line), fp) != NULL) {
// seperate each word
// first word of line
char *bp = line;
while (1) {
token = strtok(bp, delim);
bp = NULL;
if (token == NULL)
break;
dbgprt("TOKEN1: %sn", token);
addWord(list,token);
}
}
printWordList(list);
return 0;
}
@Craig Estey 为您提供了一个很好的答案,所以不要更改您的答案选择,但不要只是在评论中留下链接,有几种查看列表操作的重要方法可能会有所帮助,您必须使用内存/错误检查程序来验证您对分配内存的使用, 尤其是在处理列表操作时。
假设一个节点持有一个字符串,其引用计数为字符串出现的额外次数,就像你的情况一样,例如
typedef struct node_t { /* list node */
char *s;
size_t refcnt;
struct node_t *next;
} node_t;
使用节点地址和指向节点的指针进行迭代消除了特殊情况
使用节点的地址和指向节点的指针在Linus的了解指针中进行了讨论。
例如,如果需要检查if (list->head == NULL)
以将第一个节点添加到列表中,如果同时使用节点的地址和指向节点的指针进行迭代,则只需将分配给新节点的指针分配给当前节点的地址即可。无论它是第一个、中间还是最后一个节点,这都有效。它还消除了从列表中删除节点时不必担心前一个节点是什么。在要删除的节点上,您只需将下一个节点的内容分配给当前地址并释放最初存在的节点。这会将您的添加节点函数减少到类似于以下内容的内容:
/** add node in sort order to list.
* returns pointer to new node on success, NULL otherwise.
*/
node_t *add_node (node_t **head, const char *s)
{
node_t **ppn = head, /* address of current node */
*pn = *head; /* pointer to current node */
while (pn) { /* iterate to last node */
if (strcmp (pn->s, s) == 0) { /* compare node string with string */
pn->refcnt += 1; /* increment ref count */
return *ppn;
}
ppn = &pn->next; /* ppn to address of next */
pn = pn->next; /* advance pointer to next */
}
return *ppn = create_node (s); /* allocate and return node */
}
(注意:通过延迟新节点和字符串的分配(create_node (s)
上面),您可以避免分配,直到您知道需要添加字符串 - 简化内存处理)
如上面的注释中所述,这会将列表的doesWordExist()
遍历和addWord()
遍历结合起来,以找到结束的遍历到单个遍历中。如果列表中有数十万个节点,则不希望多次遍历列表。
使用strdup()
很好,但要知道它是 POSIX 而不是标准 C
strdup()
对于复制字符串并将结果分配给新指针很方便,但strdup()
由 POSIX 提供,因此并非所有实现都会提供它。此外,strdup()
分配内存,因此与分配内存的任何函数一样,在使用返回的指针之前,必须检查结果是否未NULL
。您可以通过编写简短的等效项来避免潜在的可移植性问题。例如,在上面显示的create_node()
中,它确实:
/** helper function to allocate node, and storage for string.
* copies string to node and initializes node->next pointer NULL
* and node->refcnt zero. returns pointer to allocated node on success,
* NULL otherwise.
*/
node_t *create_node (const char *s)
{
size_t len = strlen (s); /* get string length */
node_t *node = malloc (sizeof *node); /* allocate node */
if (!node) { /* validate EVERY allocation */
perror ("create_node: malloc node");
return NULL;
}
if (!(node->s = malloc (len + 1))) { /* allocate for string */
perror ("create_node: malloc node->s");
free (node); /* on failure, free node before returning NULL */
return NULL;
}
memcpy (node->s, s, len+1); /* copy string to node */
node->refcnt = 0; /* initialize ref count */
node->next = NULL; /* initialze next NULL */
return node; /* return allocated & initialized node */
}
释放分配的内存
每当您编写创建数据结构的代码时,它们都需要能够在您要删除单个节点时自行清理,或者使用该列表完成。当您创建仅在以下函数中声明和使用的列表等时,这变得势在必行main()
因为内存不会在函数返回时释放。使用main()
时,程序退出并释放所有分配的内存。但是,如果列表仅在main()
下面创建和使用,则每次调用该函数时,如果在返回之前未释放列表内存,则会导致内存泄漏。释放所有内存的功能简短且易于编写,例如
/** delete all nodes in list */
void del_list (node_t *head)
{
node_t *pn = head; /* pointer to iterate */
while (pn) { /* iterate over each node */
node_t *victim = pn; /* set victim to current */
pn = pn->next; /* advance pointer to next */
free (victim->s); /* free current string */
free (victim); /* free current node */
}
}
(**无需担心refcnt
,因为您要删除列表)
一个包含所有这些点的简短示例,以及一个del_node()
从列表中删除单个节点(或者如果refcnt
不为零,则减少refcnt
而不删除节点)可以是:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAXC 1024 /* if you need a constant, #define one (or more) */
typedef struct node_t { /* list node */
char *s;
size_t refcnt;
struct node_t *next;
} node_t;
/** helper function to allocate node, and storage for string.
* copies string to node and initializes node->next pointer NULL
* and node->refcnt zero. returns pointer to allocated node on success,
* NULL otherwise.
*/
node_t *create_node (const char *s)
{
size_t len = strlen (s); /* get string length */
node_t *node = malloc (sizeof *node); /* allocate node */
if (!node) { /* validate EVERY allocation */
perror ("create_node: malloc node");
return NULL;
}
if (!(node->s = malloc (len + 1))) { /* allocate for string */
perror ("create_node: malloc node->s");
free (node); /* on failure, free node before returning NULL */
return NULL;
}
memcpy (node->s, s, len+1); /* copy string to node */
node->refcnt = 0; /* initialize ref count */
node->next = NULL; /* initialze next NULL */
return node; /* return allocated & initialized node */
}
/** add node in sort order to list.
* returns pointer to new node on success, NULL otherwise.
*/
node_t *add_node (node_t **head, const char *s)
{
node_t **ppn = head, /* address of current node */
*pn = *head; /* pointer to current node */
while (pn) { /* iterate to last node */
if (strcmp (pn->s, s) == 0) { /* compare node string with string */
pn->refcnt += 1; /* increment ref count */
return *ppn;
}
ppn = &pn->next; /* ppn to address of next */
pn = pn->next; /* advance pointer to next */
}
return *ppn = create_node (s); /* allocate and return node */
}
/** print all nodes in list */
void prn_list (node_t *head)
{
if (!head) { /* check if list is empty */
puts ("list-empty");
return;
}
for (node_t *pn = head; pn; pn = pn->next) /* iterate over each node */
printf ("%-24s %4zun", pn->s, pn->refcnt); /* print string an refcount */
}
/** delete node with string s from list (for loop) */
void del_node (node_t **head, const char *s)
{
node_t **ppn = head; /* address of node */
node_t *pn = *head; /* pointer to node */
for (; pn; ppn = &pn->next, pn = pn->next) {
if (strcmp (pn->s, s) == 0) { /* does string match */
if (pn->refcnt) { /* ref count > 0 */
pn->refcnt -= 1; /* decrement ref count */
return; /* done */
}
*ppn = pn->next; /* set content at address to next */
free (pn); /* free original pointer */
break;
}
}
}
/** delete all nodes in list */
void del_list (node_t *head)
{
node_t *pn = head; /* pointer to iterate */
while (pn) { /* iterate over each node */
node_t *victim = pn; /* set victim to current */
pn = pn->next; /* advance pointer to next */
free (victim->s); /* free current string */
free (victim); /* free current node */
}
}
int main (int argc, char **argv) {
char buf[MAXC]; /* read buffer */
const char *delim = " tn.,;?!()"; /* strtok delimiters */
node_t *list = NULL; /* pointer to list (must initialize NULL */
/* use filename provided as 1st argument (stdin by default) */
FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;
if (!fp) { /* validate file open for reading */
perror ("file open failed");
return 1;
}
while (fgets (buf, MAXC, fp)) /* read all lines in file */
/* tokenize line based on delim */
for (char *p = strtok (buf, delim); p; p = strtok (NULL, delim)) {
if (ispunct(*p)) /* if word is punctionation, skip */
continue;
add_node (&list, p); /* add node or increment refcnt */
}
if (fp != stdin) /* close file if not stdin */
fclose (fp);
puts ("string refcntn" /* heading */
"-------------------------------");
prn_list (list); /* print contents of list */
del_list (list); /* free all list memory */
return 0;
}
看看delim
中包含的字符以用于strtok()
,以及使用ispunct()
跳过以标点符号开头的标记(允许使用连字符的单词,但跳过单独用作句子延续的连字符等......
示例输入文件
$ cat dat/tr_dec_3_1907.txt
No man is above the law and no man is below it;
nor do we ask any man's permission when we require him to obey it.
Obedience to the law is demanded as a right; not asked as a favor.
(Theodore Roosevelt - December 3, 1907)
示例使用/输出
$ ./bin/lls_string_nosort_refcnt dat/tr_dec_3_1907.txt
string refcnt
-------------------------------
No 0
man 1
is 2
above 0
the 1
law 1
and 0
no 0
below 0
it 1
nor 0
do 0
we 1
ask 0
any 0
man's 0
permission 0
when 0
require 0
him 0
to 1
obey 0
Obedience 0
demanded 0
as 1
a 1
right 0
not 0
asked 0
favor 0
Theodore 0
Roosevelt 0
December 0
3 0
1907 0
内存使用/错误检查
在您编写的任何动态分配内存的代码中,对于分配的任何内存块,您有 2个责任:(1)始终保留指向内存块起始地址的指针,以便 (2) 当不再需要内存块时可以释放它。
必须使用内存错误检查程序来确保不会尝试访问内存或超出/超出分配块边界的写入,不会尝试读取或基于未初始化值的条件跳转,最后确认释放了已分配的所有内存。
对于Linux来说,valgrind
是正常的选择。每个平台都有类似的内存检查器。它们都易于使用,只需通过它运行您的程序即可。
$ valgrind ./bin/lls_string_nosort_refcnt dat/tr_dec_3_1907.txt
==8528== Memcheck, a memory error detector
==8528== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==8528== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==8528== Command: ./bin/lls_string_nosort_refcnt dat/tr_dec_3_1907.txt
==8528==
string refcnt
-------------------------------
No 0
man 1
is 2
above 0
the 1
law 1
and 0
no 0
below 0
it 1
nor 0
do 0
we 1
ask 0
any 0
man's 0
permission 0
when 0
require 0
him 0
to 1
obey 0
Obedience 0
demanded 0
as 1
a 1
right 0
not 0
asked 0
favor 0
Theodore 0
Roosevelt 0
December 0
3 0
1907 0
==8528==
==8528== HEAP SUMMARY:
==8528== in use at exit: 0 bytes in 0 blocks
==8528== total heap usage: 73 allocs, 73 frees, 6,693 bytes allocated
==8528==
==8528== All heap blocks were freed -- no leaks are possible
==8528==
==8528== For counts of detected and suppressed errors, rerun with: -v
始终确认已释放已分配的所有内存,并且没有内存错误。
您已经有了解决当前问题的方法,但是在项目中继续考虑上面的一些提示,以消除列表的多次遍历,以及围绕strdup()
的可移植性(和验证)问题。祝你的编码好运。