我试图读取一个文本文件,我做了一个链表,文本文件看起来像这样:
around 1 2 1
bread 2 4 3 5 1
four 1 3 2
head 3 1 2 2 1 5 1
has 2 3 1 5 2
其中每行的第一个字符串只是段落中的单词。单词后面的第一个数字是该单词在段落中出现的行数。下面的数字是段落中的成对(行,出现次数)。
例如,对于单词bread
:
在段落的2
行中发现。在第一行4
中,发现了3
次。然后在第二行5
行,它被发现是1
时间。
我正在尝试从这个文本文件创建一个链表,我的程序看起来像这样到目前为止:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#define MAXWORD 999
typedef struct node node_t;
struct node {
char *word;
int num_lines;
int paragraph;
int freq;
node_t *next;
};
int
main(int argc, char *argv[]) {
FILE *fp;
char word[MAXWORD+1];
int ch, line_count = 0, len = 0;
node_t *node = (node_t*)malloc(sizeof(*node));
node_t *curr, *prev;
fp = fopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Error reading filen");
exit(EXIT_FAILURE);
}
/* Just trying to store the string so far */
while ((ch = getc(fp)) != EOF) {
if (ch == 'n') {
line_count++;
strcpy(node->word, word);
}
if (isalpha(ch)) {
word[len] = ch;
len++;
word[len] = ' ';
}
if (isdigit(ch)) {
len = 0;
}
}
printf("line count = %d", line_count);
free(node)
fclose(fp);
return 0;
}
在这个代码片段中,我一直在尝试将字符串存储在链表数据结构中,但是我还没有使用动态数组来存储文本文件中出现的单词后面的数字。我知道我需要使用malloc()
和realloc()
构建此数据结构,但我不确定如何做到这一点。
我该怎么做?
我想要的输出是这样的:
There are five words in the text file,
and 9 pairs of (line, occurences)
Word: pairs
"around": 2,1
"bread": 4,3; 5,1
"four": 3,2
"head": 1,2; 2,1; 5,1
"has": 3,1; 5,2
我一直在研究这个问题,它似乎与倒排索引问题非常相似,我已经看到使用二叉搜索树将是最好的。
我可以这样实现我的二叉搜索树吗?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#define MAXWORD 999
typedef char word_t[MAXWORD+1];
typedef struct node node_t;
struct node {
void *data;
int *ints;
node_t *rght;
node_t *left;
};
typedef struct {
node_t *root;
int (*cmp)(void*, void*);
} tree_t;
int
main(int argc, char *argv[]) {
FILE *fp;
fp = fopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Error reading filen");
exit(EXIT_FAILURE);
}
while ((ch = getc(fp)) != EOF) {
if (ch == 'n') {
line_count++;
}
}
fclose(fp);
return 0;
}
你可以这样做:
typedef struct {
int paragraph;
int freq;
} stats_t;
struct node {
char *word;
int num_lines;
stats_t *stats;
node_t *next;
};
然后在你解析字符串之后,你可以做:
ps = calloc(line_count, sizeof(stats_t));
获取指向stats_t
结构体数组的指针,您可以用行位置和频率填充该数组。然后可以将指针ps
存储在node
结构体中。
我写了一个我认为你正在寻找的程序。我修改了我之前正在考虑的结构:
typedef node node_t;
struct node {
char *word;
int num_lines;
int *location;
int *frequency;
node_t *next;
};
这样,节点包含指向int
数组的指针,用于存储位置和频率信息。用于字串、位置数组和频率数组的节点和存储都是动态分配的。下面是代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAXLINE 1000
#define MAXWORD 30
typedef struct node node_t;
struct node {
char *word;
int num_lines;
int *location;
int *frequency;
node_t *next;
};
void strip(char *pln);
void normalize_word(char *pstr);
struct node * update_word(char *pwd, int lnum, struct node *phead);
struct node * find_in_list(char *pwd, struct node *phead);
int find_line_pair(int lnum, struct node *pwn);
int list_len(struct node *phead);
int num_pairs(struct node *phead);
int main(int argc, char *argv[])
{
FILE *fp;
struct node *head, *current;
char *pline, *pword;
char line[MAXLINE + 1];
char word[MAXWORD + 1];
int i, n, line_count = 0;
head = NULL;
if (argc < 2) {
fprintf(stderr, "Usage: %s filenamen", argv[0]);
exit(EXIT_FAILURE);
} else {
if ((fp = fopen(argv[1], "r")) == NULL) {
fprintf(stderr, "Unable to open file %sn", argv[1]);
exit(EXIT_FAILURE);
}
}
/* Read in lines and process words */
pline = line;
pword = word;
while (fgets(pline, MAXLINE, fp) != NULL) {
++line_count;
strip(pline);
while ((pword = strtok(pline, " ")) != NULL) {
normalize_word(pword);
if (*pword != ' ') // don't add empty words
head = update_word(pword, line_count, head);
pline = NULL;
}
pline = line;
}
/* Display list contents */
printf("There are %d words in the text file,n",
list_len(head));
printf("and %d pairs of (line, occurrences)n",
num_pairs(head));
printf("Word: pairsn");
current = head;
while (current != NULL) {
n = current->num_lines;
printf("%s:", current->word);
for (i = 0; i < n; i++) {
printf(" %d, %d;",
current->location[i], current->frequency[i]);
}
putchar('n');
current = current->next;
}
/* Cleanup */
// close file
if (fclose(fp) != 0)
fprintf(stderr, "Error closing file %sn", argv[1]);
// free all allocated memory
current = head;
while (current != NULL) {
free(current->word);
free(current->location);
free(current->frequency);
current = current->next;
free(head);
head = current;
}
return 0;
}
/* Remove trailing newlines */
void strip(char *pln)
{
while (*pln != ' ') {
if (*pln == 'n')
*pln = ' ';
++pln;
}
}
/* Convert word to lowercase and remove trailing
* non-alphanumeric characters */
void normalize_word(char *pstr)
{
int i = 0;
char ch;
while ((ch = pstr[i]) != ' ') {
pstr[i] = tolower(ch);
++i;
}
while ((--i >= 0) && !isalnum(pstr[i])) {
pstr[i] = ' ';
continue;
}
}
/* Update existing word node or create a new one, and return
* a pointer to the head of the list */
struct node * update_word(char *pwd, int lnum, struct node *phead)
{
struct node *found, *newnode;
char *pword;
int *ploc, *pfreq;
int index;
/* Modify existing node if word is in list */
if ((found = find_in_list(pwd, phead)) != NULL) {
// add new (location, freq) pair if word not in found line
if ((index = find_line_pair(lnum, found)) == -1) {
index = found->num_lines; // index for new pair
found->num_lines += 1; // increment number of lines
ploc = realloc(found->location, (index + 1) * sizeof(int));
pfreq = realloc(found->frequency, (index + 1) * sizeof(int));
ploc[index] = lnum; // new location
pfreq[index] = 1; // found once in this line so far
found->location = ploc; // point to new location array
found->frequency = pfreq; // point to new frequency array
}
else { // update frequency in existing line
found->frequency[index] += 1;
}
/* Set up a new node */
} else {
// allocate memory for new node
newnode = malloc(sizeof(struct node));
// allocate memory for string pointed to from node
pword = malloc((strlen (pwd) + 1) * sizeof(char));
strcpy(pword, pwd);
newnode->word = pword; // set word pointer
newnode->num_lines = 1; // only one line so far
ploc = malloc(sizeof(int));
pfreq = malloc(sizeof(int));
*ploc = lnum; // location was passed by caller
*pfreq = 1; // only one occurrence so far
newnode->location = ploc;
newnode->frequency = pfreq;
if (phead == NULL) { // if wordlist is empty
newnode->next = NULL; // only/last link in the list
phead = newnode; // newnode is the head
} else {
newnode->next = phead; // insert newnode at front of list
phead = newnode;
}
}
return phead;
}
/* Return pointer to node containing word, or NULL */
struct node * find_in_list(char *pwd, struct node *phead)
{
struct node *current = phead;
while (current != NULL) {
if (strcmp(current->word, pwd) == 0)
return current; // word already in list
current = current->next;
}
return NULL; // word not found
}
/* Return index of existing line location, or -1 */
int find_line_pair(int lnum, struct node *pwn)
{
int n = pwn->num_lines;
int index = 0;
while (index < n) {
if (pwn->location[index] == lnum)
return index; // word already found in this line
++index;
}
return -1; // word not yet found in this line
}
/* Find number of nodes in linked list */
int list_len(struct node *phead)
{
int length = 0;
struct node *current = phead;
while (current != NULL) {
++length;
current = current->next;
}
return length;
}
/* Find number of (line, occurrence) pairs */
int num_pairs(struct node *phead)
{
int num = 0;
struct node *current = phead;
while (current != NULL) {
num += current->num_lines;
current = current->next;
}
return num;
}
注意:我从update_word()
函数的前一个版本修改了这个。原始代码在列表的末尾插入了一个新节点,因此生成的列表按照单词在输入文本中首次出现的顺序包含单词。这个版本在列表的开头插入一个新节点,因此生成的列表包含与首次出现的顺序相反的单词。这加快了节点插入的速度,并简化了节点插入代码:
current = phead;
while (current->next != NULL) // find tail
current = current->next;
current->next = newnode; // add newnode to end
:
newnode->next = phead; // insert newnode at front of list
我毫不怀疑代码可以改进,但这似乎确实有效。我不会说这很简单,但相对简单。我对这个文本文件运行它:
Three blind mice. Three blind mice.
See how they run. See how they run.
They all ran after the farmer's wife,
Who cut off their tails with a carving knife,
Did you ever see such a sight in your life,
As three blind mice?
结果如下:
There are 31 words in the text file,
and 37 pairs of (line, occurrences)
Word: pairs
as: 6, 1;
life: 5, 1;
your: 5, 1;
in: 5, 1;
sight: 5, 1;
such: 5, 1;
ever: 5, 1;
you: 5, 1;
did: 5, 1;
knife: 4, 1;
carving: 4, 1;
a: 4, 1; 5, 1;
with: 4, 1;
tails: 4, 1;
their: 4, 1;
off: 4, 1;
cut: 4, 1;
who: 4, 1;
wife: 3, 1;
farmer's: 3, 1;
the: 3, 1;
after: 3, 1;
ran: 3, 1;
all: 3, 1;
run: 2, 2;
they: 2, 2; 3, 1;
how: 2, 2;
see: 2, 2; 5, 1;
mice: 1, 2; 6, 1;
blind: 1, 2; 6, 1;
three: 1, 2; 6, 1;
这是我使用二叉搜索树(BST)的版本:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
typedef struct internal_node in_node;
struct internal_node{
int line;
int freq;
in_node* next;
};
struct tree{
char *word;
int num_lines;
in_node* in_nodeptr;
in_node* current;
struct tree* right;
struct tree* left;
};
typedef struct tree* treeptr;
void free_list(in_node* in_nodeptr){
if(in_nodeptr!=NULL) {
free(in_nodeptr);
}
}
void free_bst(treeptr head){
if (head!=NULL) {
free_bst(head->right);
free_bst(head->left);
free_list(head->in_nodeptr);
free(head->word);
free(head);
}
}
void print_list(in_node* in_nodeptr){
while(in_nodeptr!=NULL){
printf("%d %d; ",in_nodeptr->line,in_nodeptr->freq);
in_nodeptr=in_nodeptr->next;
}
}
void print_bst(treeptr head){
if(head!=NULL){
printf("%s: ",head->word);
print_list(head->in_nodeptr);
printf("n");
print_bst(head->right);
print_bst(head->left);
}
}
void input_to_bst(treeptr* head,char* word,int line){
if((*head)==NULL){
(*head)=(treeptr)malloc(sizeof(struct tree));
(*head)->word=(char*)malloc(50*sizeof(char));
strcpy(((*head)->word),word);
(*head)->num_lines=1;
(*head)->right=NULL;
(*head)->left=NULL;
(*head)->in_nodeptr=(in_node*)malloc(sizeof(in_node));
(*head)->in_nodeptr->line=line;
(*head)->in_nodeptr->freq=1;
(*head)->in_nodeptr->next=NULL;
(*head)->current=(*head)->in_nodeptr;
}
else{
int check=strcmp(((*head)->word),word);
if(check>0) input_to_bst(&((*head)->left),word,line);
else if(check<0) input_to_bst(&((*head)->right),word,line);
else{
if( (*head)->current->line==line) (*head)->current->freq++;
else {
(*head)->current->next=(in_node*)malloc(sizeof(in_node));
(*head)->current->next->line=line;
(*head)->current->next->freq=1;
(*head)->current->next->next=NULL;
}
}
}
}
int main(int argc, char *argv[]) {
treeptr head=NULL;
FILE *fp=fopen(argv[1], "r");
char word[50],ch;
int len=0,lines=1;
if (fp == NULL) {
fprintf(stderr, "Error reading filen");
exit(1);
}
while ((ch = getc(fp)) != EOF) {
if (ch == 'n') {
word[len]=' ';
if(len>0) input_to_bst(&head,word,lines);
len=0;
lines++;
}
else if (ch==' '){
word[len]=' ';
if(len>0) input_to_bst(&head,word,lines);
len=0;
}
else if (isalpha(ch)){
word[len]=ch;
len++;
}
}
if(len>0) {
word[len]=' ';
input_to_bst(&head,word,lines);
}
print_bst(head);
fclose(fp);
free_bst(head);
return 0;
}
每个单词都被保存为BST的一个节点,并且BST的每个节点除了该单词之外,都保存一个包含该单词所有出现(行和频率)的列表。为了尽可能提高效率,我们保留了一个指向表中最后一个元素的指针(in_node* current
),这样我们就不需要每次添加表时都遍历。
例如:
文本:C is an imperative procedural language. It was designed to be compiled
using a relatively straightforward compiler and to require minimal
runtime support.
输出: C: 1 1;
is: 1 1;
procedural: 1 1;
was: 1 1;
to: 1 1; 2 1;
using: 2 1;
relatively: 2 1;
straightforward: 2 1;
support: 3 1;
require: 2 1;
runtime: 3 1;
language: 1 1;
minimal: 2 1;
an: 1 1;
imperative: 1 1;
designed: 1 1;
be: 1 1;
compiled: 1 1;
compiler: 2 1;
and: 2 1;
It: 1 1;
a: 2 1;
注意上面的实现是区分大小写的,例如"And"one_answers"And"是不同的。如果您不希望区分大小写,只需将word[len]=ch;
行替换为word[len]=tolower(ch);
,即可正常工作。上面算法的复杂度是O(n^2),如果你只使用链表,复杂度是一样的,但在平均情况下,BST是O(nlogn),这比链表要好得多,这就是为什么它被认为是更好的原因。还请注意,由于我们必须为每个单词的出现保留一个列表,如果我们不保留in_node* current
指针,这将使我们在常量时间(O(1))内访问每个出现列表的末尾,那么复杂性将是最差的。所以我认为就复杂度而言你不能比0 (nlogn)更好