我有这个方法:(它应该从给定目录中的所有文件创建一个倒排列表(
class Index{
public:
Index();
void create();
void writeInvertedIndex();
private:
bool isWhiteSpace(const char ch);
std::map<std::string, std::set<int>> invertedIndex;
};
void Index::create(){
std::string datasetPath = "/home/skluzada/Downloads/BI-VWM/Project/dataset/";
std::string filePath, word, text;
std::ifstream infile;
int fileIndex = 0;
std::size_t textLen, i;
DIR * dir;
struct dirent * ent;
if ((dir = opendir (datasetPath.c_str())) != NULL){
while((ent = readdir(dir)) != NULL){
filePath = datasetPath + ent->d_name;
std::cout << filePath << std::endl;
std::ifstream inFile(filePath, std::ios::in);
std::stringstream buffer;
buffer << inFile.rdbuf();
std::string text = buffer.str();
inFile.close();
textLen = text.size();
i = 19;
while (i < textLen){
word = "";
while(isWhiteSpace(text[i])){
i++;
}
while(!isWhiteSpace(text[i])){
word = word + text[i];
i++;
}
invertedIndex[word].insert(fileIndex);
}
fileIndex++;
}
}
}
当我在一小群文件(大约 50Kb 的文本(上运行该程序时,它工作正常,但是当我在实际集合(总共 500 个文件约 1.5Mb(上运行它时,它在处理大约 50 个文件后出现段错误。输出倒排索引约为900Kb。
我想将整个文件加载到字符串中不是一个理想的解决方案(每个文件为 1-5Kb(,但我在项目的其他部分使用了类似的方法,即使在整个集合上也能正常工作。
你能建议问题出在哪里吗?或者也许给我一些关于优化的建议?
最奇怪的是,当我使用 Valgrind 运行它时,它会处理整个集合而不会出现段错误。这是瓦尔格林德输出:
==9952== Syscall param writev(vector[...]) points to uninitialised byte(s)
==9952== at 0x57F6610: __writev_nocancel (in /usr/lib64/libc-2.25.so)
==9952== by 0x4EEC4B1: std::__basic_file<char>::xsputn_2(char const*, long, char const*, long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F29BC1: std::basic_filebuf<char, std::char_traits<char> >::xsputn(char const*, long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F4E063: std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x401BFC: Index::writeInvertedIndex() (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x4021D0: main (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== Address 0x6f929f4 is 84 bytes inside a block of size 2,273 alloc'd
==9952== at 0x4C2E1CA: operator new(unsigned long) (vg_replace_malloc.c:334)
==9952== by 0x4F62144: void std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_construct<char*>(char*, char*, std::forward_iterator_tag) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F6219E: std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x404A5F: std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, 0ul>(std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&, std::tuple<>&, std::_Index_tuple<0ul>, std::_Index_tuple<>) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x404782: std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(std::piecewise_construct_t, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<>) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x40458C: void __gnu_cxx::new_allocator<std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >::construct<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<> >(std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >*, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&&, std::tuple<>&&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x404247: void std::allocator_traits<std::allocator<std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > > >::construct<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<> >(std::allocator<std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >&, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >*, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&&, std::tuple<>&&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x403C6C: void std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >, std::_Select1st<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >::_M_construct_node<std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<> >(std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >*, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&&, std::tuple<>&&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x403069: std::_Rb_tree_node<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >* std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >, std::_Select1st<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >::_M_create_node<std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<> >(std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&&, std::tuple<>&&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x402C4D: std::_Rb_tree_iterator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > std::_Rb_tree<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > >, std::_Select1st<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >::_M_emplace_hint_unique<std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>, std::tuple<> >(std::_Rb_tree_const_iterator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > >, std::piecewise_construct_t const&, std::tuple<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>&&, std::tuple<>&&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x4028A4: std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::set<int, std::less<int>, std::allocator<int> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::set<int, std::less<int>, std::allocator<int> > > > >::operator[](std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x40203A: Index::create() (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952==
==9952== Syscall param writev(vector[...]) points to uninitialised byte(s)
==9952== at 0x57F6610: __writev_nocancel (in /usr/lib64/libc-2.25.so)
==9952== by 0x4EEC4B1: std::__basic_file<char>::xsputn_2(char const*, long, char const*, long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F29BC1: std::basic_filebuf<char, std::char_traits<char> >::xsputn(char const*, long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F42581: std::ostreambuf_iterator<char, std::char_traits<char> > std::num_put<char, std::ostreambuf_iterator<char, std::char_traits<char> > >::_M_insert_int<long>(std::ostreambuf_iterator<char, std::char_traits<char> >, std::ios_base&, char, long) const (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F4E564: std::ostream& std::ostream::_M_insert<long>(long) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x401C82: Index::writeInvertedIndex() (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x4021D0: main (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== Address 0x5d0f861 is 721 bytes inside a block of size 8,192 alloc'd
==9952== at 0x4C2E8B7: operator new[](unsigned long) (vg_replace_malloc.c:423)
==9952== by 0x4F2AA87: std::basic_filebuf<char, std::char_traits<char> >::_M_allocate_internal_buffer() (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F2EC71: std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x4F2ED92: std::basic_ofstream<char, std::char_traits<char> >::open(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::_Ios_Openmode) (in /usr/lib64/libstdc++.so.6.0.24)
==9952== by 0x401B75: Index::writeInvertedIndex() (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952== by 0x4021D0: main (in /home/skluzada/Downloads/BI-VWM/Project/index)
==9952==
==9952==
==9952== HEAP SUMMARY:
==9952== in use at exit: 32,816 bytes in 1 blocks
==9952== total heap usage: 820,861 allocs, 820,860 frees, 16,188,505,659 bytes allocated
==9952==
==9952== LEAK SUMMARY:
==9952== definitely lost: 32,816 bytes in 1 blocks
==9952== indirectly lost: 0 bytes in 0 blocks
==9952== possibly lost: 0 bytes in 0 blocks
==9952== still reachable: 0 bytes in 0 blocks
==9952== suppressed: 0 bytes in 0 blocks
==9952== Rerun with --leak-check=full to see details of leaked memory
==9952==
==9952== For counts of detected and suppressed errors, rerun with: -v
==9952== Use --track-origins=yes to see where uninitialised values come from
==9952== ERROR SUMMARY: 681764 errors from 10 contexts (suppressed: 0 from 0)
所问问题的解决方案是 rafix07 的评论:
你为什么不在
while(isWhiteSpace(text[i])){i++;}
和下一个while循环中检查i < textLen
,你可能会读取超出范围的数据。更改为while (i < textLen && isWhiteSpace[text[i]]){i++;}
并在第二个循环中执行相同的操作。