使用 std::向量分配的类比 LOT 的指针分配慢

我有一个处理数组分配的类。我的类很简单，定义如下：

DimArray.hpp：

#ifndef DIMARRAY_HPP_INCLUDED
#define DIMARRAY_HPP_INCLUDED
#include <vector>
template<typename T>
class DimArray
{
    private:
        int Width, Height;
        std::vector<T> Data;
    public:
        DimArray(int Width, int Height);
        DimArray(T* Data, int Width, int Height);
        DimArray(T** Data, int Width, int Height);
        DimArray(const DimArray &da);
        DimArray(DimArray &&da);
        inline int size() {return Width * Height;}
        inline int size() const {return Width * Height;}
        inline int width() {return Width;}
        inline int width() const {return Width;}
        inline int height() {return Height;}
        inline int height() const {return Height;}
        inline T* operator [](int Index) {return const_cast<T*>(Data.data()) + Height * Index;}
        inline const T* operator [](int Index) const {return Data.data() + Height * Index;}
        DimArray& operator = (DimArray da);
};
template<typename T>
DimArray<T>::DimArray(int Width, int Height) : Width(Width), Height(Height), Data(Width * Height, 0) {}
template<typename T>
DimArray<T>::DimArray(T* Data, int Width, int Height) : Width(Width), Height(Height), Data(Width * Height, 0) {std::copy(Data, Data + Width * Height, const_cast<T*>(this->Data.data()));}
template<typename T>
DimArray<T>::DimArray(T** Data, int Width, int Height) : Width(Width), Height(Height), Data(Width * Height, 0) {std::copy(Data[0], Data[0] + Width * Height, const_cast<T*>(this->Data.data()));}
template<typename T>
DimArray<T>::DimArray(const DimArray &da) : Width(da.Width), Height(da.Height), Data(da.Data) {}
template<typename T>
DimArray<T>::DimArray(DimArray &&da) : Width(std::move(da.Width)), Height(std::move(da.Height)), Data(std::move(da.Data)) {}
template<typename T>
DimArray<T>& DimArray<T>::operator = (DimArray<T> da)
{
    this->Width = da.Width;
    this->Height = da.Height;
    this->Data.swap(da.Data);
    return *this;
}
#endif // DIMARRAY_HPP_INCLUDED

对于基准计时，我使用以下方法：

计时器.hpp：

#ifndef TIME_HPP_INCLUDED
#define TIME_HPP_INCLUDED
#include <chrono>
#if defined _WIN32 || defined _WIN64
#include <windows.h>
template<typename T>
class Timer
{
    private:
        typedef T duration;
        typedef typename T::rep rep;
        typedef typename T::period period;
        typedef std::chrono::time_point<Timer, duration> time_point;
        std::chrono::time_point<Timer, duration> Time;
        static const bool is_steady = true;
        const rep g_Frequency = []() -> rep
        {
            LARGE_INTEGER frequency;
            QueryPerformanceFrequency(&frequency);
            return frequency.QuadPart;
        }();
        inline std::chrono::time_point<Timer, duration> now()
        {
            LARGE_INTEGER count;
            QueryPerformanceCounter(&count);
            return time_point(duration(count.QuadPart * static_cast<rep>(period::den) / g_Frequency));
        }
    public:
        inline void Start() {this->Time = this->now();}
        inline rep End() {return std::chrono::duration_cast<T>(this->now() - this->Time).count();}
};
#else
template<typename T>
class Timer
{
    private:
        static const bool is_steady = true;
        std::chrono::high_resolution_clock Clock;
        std::chrono::time_point<std::chrono::high_resolution_clock> Time;
    public:
        inline void Start() {this->Time = this->Clock.now();}
        inline T::rep End() {return std::chrono::duration_cast<T>(this->Clock.now() - this->Time).count();}
};
#endif
#endif // TIME_HPP_INCLUDED

我的基准如下：

int main()
{
    Timer<std::chrono::nanoseconds> T;
    T.Start();
    for (int i = 0; i < 100; ++i)
    {
        int** T2DArray = new int*[10000];
        for (int i = 0; i < 10000; ++i)
        {
            T2DArray[i] = new int[10000];
        }
        for (int i = 0; i < 10000; ++i)
        {
            delete[] T2DArray[i];
        }
        delete[] T2DArray;
    }
    std::cout<<T.End()<<" usnn";
    T.Start();
    for (int i = 0; i < 100; ++i)
    {
        DimArray<int> TwoDArray(10000, 10000);
    }
    std::cout<<T.End()<<" usnn";
}

它打印的结果是：

4.9599256 seconds  //for int**
42.9303941 seconds //for DimArray<int>

这是一个巨大的差异！我想不通为什么？！

所以我把它改成：

int main()
{
    Timer<std::chrono::nanoseconds> T;
    T.Start();
    for (int i = 0; i < 100; ++i)
    {
        int** T2DArray = new int*[10000];
        for (int i = 0; i < 10000; ++i)
        {
            T2DArray[i] = new int[10000];
        }
        for (int i = 0; i < 10000; ++i)
        {
            delete[] T2DArray[i];
        }
        delete[] T2DArray;
    }
    std::cout<<T.End()<<" usnn";
    T.Start();
    for (int i = 0; i < 100; ++i)
    {
        int* TwoDArray = new int[10000 * 10000];
        delete[] TwoDArray;
    }
    std::cout<<T.End()<<" usnn";
}

结果是：

4.6085725 seconds //for int**
0.1142958 seconds //for int*

知道为什么我的类使用 std::vector 与使用原始指针相比如此慢吗？

vector将为您分配的内存清零。您的代码与new为您提供"垃圾初始化"的记忆。因此，分配TwoDArray(10000, 10000)给你一个充满零的数组，而new int[10000 * 10000]给你一个未定义内容的数组。（仅仅观察会导致未定义的行为）

请注意，这意味着在向量情况下，程序实际上写入所有100000000 int，而在new情况下，程序只为这么多int留出地址空间。

对于类似的测量，您必须首先将新的阵列归零; 例如，使用int* TwoDArray = new int[10000 * 10000]();而不是int* TwoDArray = new int[10000 * 10000];。

为了说明Billy ONeal的建议，以及戏剧性的结果，我这样做了：

template <class T>
class no_init_alloc
    : public std::allocator<T>
{
public:
    using std::allocator<T>::allocator;
    template <class U, class... Args> void construct(U*, Args&&...) {}
};
template<typename T>
class DimArray
{
    private:
        int Width, Height;
        std::vector<T, no_init_alloc<T>> Data;
    public:
        …

我的结果从：

2959854464 us
28734347029 us

自：

2980402236 us
850190 us

我从这个除夕派对上有点晚了。

除了提供自定义分配器外，还可以调用reserve()，然后在实际数据可用时调用push_back()/emplace_back()。从未定义行为的角度来看，它也是安全的：您只能访问已经初始化的数据。遗憾的是，此方法可能不适合您的应用程序。

人们必须非常小心这种基准测试。例如，Linux 执行延迟分配;如果我运行下面的代码，我会得到以下内容：

$/usr/bin/time --verbose ./a.out
int[]： 0 ms
矢量： 0 ms
...
最大驻留集大小（KB）：1004

如果我取消注释中的任何代码，它将变为 391 MB。

#include <chrono>
#include <cstdio>
#include <vector>
using namespace std;
using namespace std::chrono;
constexpr size_t SIZE = 100000000;
int main(int argc, char* argv[]) {
  auto t1 = high_resolution_clock::now();
  {
    int* pInt = new int[SIZE];
    //int* pInt = new int[SIZE]();
    pInt[0] = 0;
    delete[] pInt;
  }
  auto t2 = high_resolution_clock::now();  
  {
    vector<int> v;
    v.reserve(SIZE);
    v.push_back(0);
    //v.resize(SIZE, 0);
  }
  auto t3 = high_resolution_clock::now();
  printf("int[]:  %ld  msn", duration_cast<milliseconds>(t2-t1).count());
  printf("vector: %ld  msn", duration_cast<milliseconds>(t3-t2).count());
}

相关内容

最新更新

热门标签：