我有一个3D矩阵mat[100][100][100]
。找到mat[0][][], mat[1][][],....,mat[99][][]
中出现的具有相同元素的行的有效方法是什么?一个简单的方法是将mat[0][][]
的每一行与其余99个矩阵的所有行进行比较,但这不是很有效(我想)。有更好的方法吗?
要扩展@chux的注释,第一步是为每个矩阵的每一行计算一个哈希值。总共有10000个哈希值。结果应该存储在一个由10000个structs组成的数组中。
struct info
{
int m; // the matrix number
int row; // the row number
uint32_t hash; // the hash value for mat[m][row]
};
static struct info hashArray[10000];
在填充完hashArray
的所有10000个条目后,按哈希值对数组进行排序。然后,您可以简单地扫描数组以查找任何重复的哈希值。当您确实发现重复时,您需要通过比较行元素来确认。
我终于找到了一些时间来编写内容寻址代码。事实证明,它比使用哈希表要快得多。但问题是,代码要复杂得多,程序占用的内存也要多得多。我的最后一个观点是,除非你真的需要额外的速度,否则坚持使用哈希表。
下面给出了一些测试运行的示例。程序的参数指定唯一的行数。程序用随机选择的现有行填充其余行。然后,行被打乱。该程序查找所有重复的行,并报告重复行的数量以及哈希表和内容可寻址表所花费的时间。
bing@mint5 ~ $ cc -O2 cattest.c -o cattest
bing@mint5 ~ $ ./cattest 500
CAT Test 9500 0.0083
Hash Test 9500 0.0499
bing@mint5 ~ $ ./cattest 5000
CAT Test 5000 0.0195
Hash Test 5000 0.1477
bing@mint5 ~ $ ./cattest 9000
CAT Test 1000 0.0321
Hash Test 1000 0.1092
/* content addressable table vs hash table */
/* written by Bing H Bang */
/* I DONOT give permission to any snot-nosed students to copy my work and turn it in
as their own */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/sysinfo.h>
double etime()
{
struct timeval tv;
double dt, df;
gettimeofday(&tv, NULL);
dt = (double)(tv.tv_sec);
df = ((double)(tv.tv_usec))/1000000.0;
return(dt+df);
}
struct CAT_entry
{
unsigned fval;
unsigned short rows[10000];
unsigned short num;
unsigned short used;
struct CAT_entry *next;
} *CAT[256] = {NULL};
struct CAT_entry stmem[10000];
int stidx = 0;
unsigned dat[100][10000];
char map[10000];
unsigned hasht[10000];
#define highbit (1 << ((sizeof(unsigned)*8)-1))
unsigned
rotxor(unsigned sum, unsigned v)
{
if((sum & highbit) == 0)
return ((sum << 1) ^ v);
else
return (((sum << 1) | 1) ^ v);
}
unsigned
compute_hash(int y)
{
int x;
unsigned sum = 0;
for(x = 0; x < 100; ++x)
sum = rotxor(sum, dat[x][y]);
return sum;
}
void
mk_hasht()
{
int y;
for(y = 0; y < 10000; ++y)
hasht[y] = compute_hash(y);
}
clearmap()
{
memset((void *)map, 0, 10000);
}
comprow(int y, int yd)
{
int x;
for(x = 0; x < 100; ++x)
if(dat[x][y] != dat[x][yd])
return 0;
return 1;
}
struct CAT_entry **
srch_CAT(unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]);
static struct CAT_entry *r = NULL;
while(*p != NULL)
{
if((*p)->fval == value)
break;
if((*p)->fval > value)
return &r;
else
p = &((*p)->next);
}
return p;
}
void
add_entry(int y, unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]), *q;
while(*p != NULL)
{
q = *p;
if(q->fval == value)
{
q->rows[q->num] = y;
q->num++;
return;
}
if(q->fval > value)
break;
p = &(q->next);
}
q = *p;
//*p = malloc(sizeof(struct CAT_entry));
*p = &stmem[stidx++];
(*p)->next = q;
q = *p;
q->fval = value;
q->num = 0;
q->used = 0;
}
void
mk_CAT()
{
int x,y;
struct CAT_entry **p, *q;
for(y = 0; y < 10000; y++)
add_entry(y, dat[0][y]);
for(x=0; x < 256; ++x)
{
p = &(CAT[x]);
while(*p != NULL)
{
q = *p;
if(q->num == 0)
{
*p = q->next;
//free(q);
}
else
p = &(q->next);
}
}
}
void
gen_data(int npat)
{
int x, y, rnum, limit;
unsigned r;
srandom(time(NULL));
rnum = npat * 100;
for(y = 0; y < rnum; ++y)
dat[y%100][y/100] = random();
for(y = npat; y < 10000; ++y)
{
rnum = random() % npat;
for(x = 0; x < 100; ++x)
dat[x][y]=dat[x][rnum];
}
for(y = 0; y < 10000; ++y)
{
rnum = random() % 10000;
if(rnum == y)
continue;
for(x = 0; x < 100; ++x)
{
r = dat[x][y];
dat[x][y]=dat[x][rnum];
dat[x][rnum] = r;
}
}
}
int
do_CATtest()
{
int y, yd, count = 0, i;
struct CAT_entry **p, *q;
mk_CAT();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] == 0)
{
map[y] = 1;
if(*(p = srch_CAT(dat[0][y])) != NULL)
{
for(q = *p, i = 0; i < q->num; ++i)
{
yd = q->rows[i];
if(map[yd] == 0)
{
if(comprow(y, yd))
{
map[yd] = 1;
++count;
q->used++;
}
}
}
if(q->num <= q->used)
*p = q->next;
}
}
}
return count;
}
int
do_hashtest()
{
unsigned h;
int x, y, yd, count = 0;
mk_hasht();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] != 0)
continue;
map[y] = 1;
h = hasht[y];
for(yd = y+1; yd < 10000; ++yd)
{
if(map[yd] != 0)
continue;
if(h == hasht[yd])
if(comprow(y, yd))
{
map[yd] = 1;
++count;
}
}
}
return count;
}
main(int c, char *v[])
{
int npat = 0, count;
double t1, t2;
if(c == 2)
npat = atoi(v[1]);
if(npat <= 0 || npat >= 10000)
{
puts("input param error");
exit(1);
}
gen_data(npat);
npat = 10000 - npat;
t1 = etime();
if((count = do_CATtest()) != npat)
{
printf("CAT test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("CAT Test %d %.4fn", npat, t2-t1);
t1 = etime();
if((count = do_hashtest()) != npat)
{
printf("hash test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("Hash Test %d %.4fn", npat, t2-t1);
}
为每行中的第一个值创建一个内容可寻址表。然后遍历每一行,取第一个值并在表上查找。如果查找返回多行,则应检查这些行是否匹配。应该记住搜索的行以提高效率,因为不需要再次检查检查的行。您将得到一个相同行分组的列表。