pthread 并行代码在 Windows 上比串行代码快,但在 Linux 上却比串行代码慢



我在 Windows 和 Linux 上用相同的线程数(4 个线程,4 个 CPU)运行同一份 C++ 并行代码,使用蒙特卡洛算法计算 pi。在 Windows 上,并行代码比串行实现快;但在 Linux 上,并行代码反而比串行实现慢得多。

这是程序:

#include <pthread.h>

#include <chrono>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <vector>
// NOTE(review): file-scope using-directive; it is why names such as `chrono`
// in the aliases below can be written without the std:: prefix.
using namespace std;
// Duration unit the elapsed times are converted to before being printed.
using ns = chrono::nanoseconds;
// Clock used for every timing measurement in this file.
using get_time = chrono::steady_clock;
// Number of worker threads; defaults to 4 and is overwritten by main() when a
// second command-line argument is supplied.
static int thread_count = 4;
// Protects the shared hit accumulator that threadFunc() adds to; it is
// initialised and destroyed inside compute_pi_parallel().
pthread_mutex_t myMutex;
/// Per-thread work description handed to threadFunc() through pthread_create().
struct args {
    /// Index of the worker, assigned in creation order starting at 0.
    int id;
    /// Number of random points this worker has to sample.
    int random_count;
    /// Shared accumulator the worker adds its hit count to (not owned).
    double* pi;
};
/**
 * Serial Monte Carlo estimate of pi.
 *
 * Draws n points uniformly from the square [-1, 1] x [-1, 1] using rand() and
 * counts how many land inside the unit circle; the fraction of hits times 4
 * approximates pi.
 *
 * @param n  number of points to sample
 * @return   4 * hits / n
 */
double compute_pi(long n)
{
    double hits = 0;
    for (long left = n; left > 0; --left) {
        // Map rand()'s [0, RAND_MAX] range onto [-1, 1]; x is drawn before y.
        const double px = 2 * static_cast<double>(std::rand()) / RAND_MAX - 1;
        const double py = 2 * static_cast<double>(std::rand()) / RAND_MAX - 1;
        const double radius = std::sqrt(px * px + py * py);
        if (radius <= 1.0) {
            hits += 1;
        }
    }
    return 4 * hits / n;
}
void* threadFunc(void *argin){
args *inputs = (args*) argin;
double my_sum = 0;
double x, y;
for(int i=0; i<inputs->random_count; i++){
x = -1 + 2 * double(rand())/RAND_MAX;
y = -1 + 2 * double(rand())/RAND_MAX;
if (sqrt(x*x + y*y) <= 1.0) my_sum++;
}
pthread_mutex_lock(&myMutex);
*(inputs->pi) += my_sum;
pthread_mutex_unlock(&myMutex);
return nullptr;
}
double compute_pi_parallel(long n)
{
double pi = 0;
int count_per_thread = n/thread_count;
pthread_t *threads = new pthread_t[thread_count];
args *funcInputs = new args[thread_count];
pthread_mutex_init(&myMutex, nullptr);
for(int i=0; i<thread_count; i++){
funcInputs[i].id = i;
funcInputs[i].random_count = i<n%thread_count ? count_per_thread+1 : 
count_per_thread;
funcInputs[i].pi = &pi;
int rc = pthread_create(&threads[i], nullptr, threadFunc, (void *) 
&funcInputs[i]);
if(rc) cerr << "error in thread creation!n";
}
for(int i=0; i<thread_count; i++){
int rc = pthread_join(threads[i], nullptr);
if(rc) cerr << "Error in thread join!n";
}
pthread_mutex_destroy(&myMutex);
delete [] funcInputs;
delete [] threads;
return 4*pi/n;
}
int main(int argc, char* argv[])
{
srand(time(nullptr));
long n = 100000000;
auto start = get_time::now();
if (argc > 1){
n = atol(argv[1]);
if (argc == 3){
thread_count = atoi(argv[2]);
cout << "pi(parallel) = " << compute_pi_parallel(n) << endl;
auto stop = get_time::now();
auto diff = stop - start;
cout<<"Elapsed time is :  "<< chrono::duration_cast<ns>
(diff).count()/1e9<<" s "<<endl;
return 0;
}
}
cout << "pi = " << compute_pi(n) << endl;
auto stop_s = get_time::now();
auto diff_s = stop_s - start;
cout << "pi(parallel) = " << compute_pi_parallel(n) << endl;
auto stop_p = get_time::now();
auto diff = stop_p - stop_s;
cout<<"Elapsed time for serial is :  "<< chrono::duration_cast<ns>
(diff_s).count()/1e9<<" s "<<endl;
cout<<"Number of threads: "<< thread_count<< endl;
cout<<"Elapsed time for parallel is :  "<< chrono::duration_cast<ns>
(diff).count()/1e9<<" s "<<endl;
return 0;
}

Windows 上的输出:

pi = 3.14146
pi(parallel) = 3.14087
Elapsed time for serial is :  6.16426 s
Number of threads: 4
Elapsed time for parallel is :  1.0659 s

在 Linux 上:$g++ -std=c++11 -g -Wall -o mc mc.cpp -lpthread

输出:

pi = 3.14138
pi(parallel) = 3.14166
Elapsed time for serial is :  3.10837 s 
Number of threads: 4
Elapsed time for parallel is :  19.8226 s

我用 lscpu 检查了 Linux 上的 CPU 数量,并用 top 监控了 CPU 使用率。看起来 Linux 确实用上了所有可用的内核,但并行代码仍然比串行代码慢。该程序运行在 Ubuntu 16.04 LTS 上,而这个 Ubuntu 是装在 Windows 主机上的虚拟机里的。

我想知道我在 Linux 上是否做错了什么。

您使用了 rand。rand 是否线程安全是由实现定义的(implementation-defined)。它很可能只是在内部加了一把互斥锁,这样多个线程同时调用时就会互相争用这把锁,实际上变成了串行执行。请改用没有全局状态的现代 C++ 随机数生成器(例如 <random> 中的引擎,每个线程各用一个)。

相关内容

最新更新