如何在 Erlang 中有效地实现 MapReduce 示例



我正在尝试比较并发编程语言的性能,例如Haskell,Go和Erlang。以下 Go 代码计算平方和(重复计算 R 次的平方和(:

1^2+2^2

+3^2....1024^2

package main
import "fmt"
func mapper(in chan int, out chan int) {
    for v := range in {out <- v*v}
}
func reducer(in1, in2 chan int, out chan int) {
    for i1 := range in1 {i2 := <- in2; out <- i1 + i2}
}
func main() {
    const N = 1024  // calculate sum of squares up to N; N must be power of 2
    const R = 10  // number of repetitions to fill the "pipe"
    var r [N*2]chan int
    for i := range r {r[i] = make(chan int)}
    var m [N]chan int
    for i := range m {m[i] = make(chan int)}
    for i := 0; i < N; i++ {go mapper(m[i], r[i + N])}
    for i := 1; i < N; i++ {go reducer(r[i * 2], r[i *2 + 1], r[i])}
    go func () {
        for j := 0; j < R; j++ {
            for i := 0; i < N; i++ {m[i] <- i + 1} 
        }
    } ()
    for j := 0; j < R; j++ {
        <- r[1]
    } 
}

下面的代码是 Erlang 中的 MapReduce 解决方案。我是 Erlang 的新手。 我想比较一下 Go、Haskell 和 Erlang 之间的性能。我的问题是如何优化这个 Erlang 代码。我使用 erlc -W mr.erl 编译此代码并使用 erl -noshell -s mr start -s init stop -extra 1024 1024 运行代码。是否有任何特殊的编译和执行选项可用于优化?我非常感谢您能提供的任何帮助。

-module(mr).
-export([start/0, create/2, doreduce/2, domap/1, repeat/3]).
start()->
    [Num_arg|Repeat] = init:get_plain_arguments(),
    N = list_to_integer(Num_arg),
    [R_arg|_] = Repeat,
    R = list_to_integer(R_arg),
    create(R, N).
create(R, Num) when is_integer(Num), Num > 0 ->
    Reducers = [spawn(?MODULE, doreduce, [Index, self()]) || Index <- lists:seq(1, 2*Num - 1)],
    Mappers = [spawn(?MODULE, domap, [In]) || In <- lists:seq(1, Num)],
    reducer_connect(Num-1, Reducers, self()),
    mapper_connect(Num, Num, Reducers, Mappers),
    repeat(R, Num, Mappers).
repeat(0, Num, Mappers)->
    send_message(Num, Mappers),
    receive
        {result, V}->
            %io:format("Repeat: ~p ~p ~n", [0, V])
            true
    end;
repeat(R, Num, Mappers)->
    send_message(Num, Mappers),
    receive
        {result, V}->
            %io:format("Got: ~p ~p ~n", [R, V])
            true
    end,
    repeat(R-1, Num, Mappers).
send_message(1, Mappers)->
    D = lists:nth (1, Mappers),
    D ! {mapper, 1};
send_message(Num, Mappers)->
    D = lists:nth (Num, Mappers),
    D ! {mapper, Num}, 
    send_message(Num-1, Mappers).
reducer_connect(1, RList, Root)->
    Parent = lists:nth(1, RList),
    Child1 = lists:nth(2, RList),
    Child2 = lists:nth(3, RList),
    Child1 ! {connect, Parent},
    Child2 ! {connect, Parent},
    Parent !{connect, Root};      
reducer_connect(Index, RList, Root)->
    Parent = lists:nth(Index, RList),
    Child1 = lists:nth(Index*2, RList),
    Child2 = lists:nth(Index*2+1, RList),
    Child1 ! {connect, Parent},
    Child2 ! {connect, Parent},
    reducer_connect(Index-1, RList, Root).
mapper_connect(1, Num, RList, MList)->
    R = lists:nth(Num, RList),
    M = lists:nth(1, MList),
    M ! {connect, R};
mapper_connect(Index, Num, RList, MList) when is_integer(Index), Index > 0 ->
    R = lists:nth(Num + (Index-1), RList),
    M = lists:nth(Index, MList),
    M ! {connect, R},
    mapper_connect(Index-1, Num, RList, MList).  
doreduce(Index, CurId)->
    receive
        {connect, Parent}->
            doreduce(Index, Parent, 0, 0, CurId)
    end.
doreduce(Index, To, Val1, Val2, Root)->
    receive
        {map, Val} ->
            if Index rem 2 == 0 ->    
                To ! {reduce1, Val},
                doreduce(Index, To, 0, 0, Root);
            true->
                To ! {reduce2, Val},
                doreduce(Index, To, 0, 0, Root)
            end;
        {reduce1, V1} when Val2 > 0, Val1 == 0 ->
            if Index == 1 ->% root node
                Root !{result, Val2 + V1},
                doreduce(Index, To, 0, 0, Root);
            Index rem 2 == 0 ->    
                To ! {reduce1, V1+Val2},
                doreduce(Index, To, 0, 0, Root);
            true->
                To ! {reduce2, V1+Val2},
                doreduce(Index, To, 0, 0, Root)
            end;
        {reduce2, V2} when Val1 > 0, Val2 == 0 ->
            if Index == 1 ->% root node
                Root !{result, Val1 + V2},
                doreduce(Index, To, 0, 0, Root);
            Index rem 2 == 0 ->
                To ! {reduce1, V2+Val1},
                doreduce(Index, To, 0, 0, Root);
            true->
                To ! {reduce2, V2+Val1},
                doreduce(Index, To, 0, 0, Root)
            end;
        {reduce1, V1} when Val1 == 0, Val2 == 0 ->
            doreduce(Index, To, V1, 0, Root);
        {reduce2, V2} when Val1 == 0, Val2 == 0 ->
            doreduce(Index, To, 0, V2, Root);
        true->
            true
    end.
domap(Index)->
    receive
       {connect, ReduceId}->
            domap(Index, ReduceId)
    end.
domap(Index, To)->
    receive
        {mapper, V}->
            To !{map, V*V},
            domap(Index, To);
        true->
            true
    end.

尽管这对 Erlang 来说根本不是一个好任务,但有一个非常简单的解决方案:

-module(mr).
-export([start/1, start/2]).
start([R, N]) ->
    Result = start(list_to_integer(R), list_to_integer(N)),
    io:format("~B x ~B~n", [length(Result), hd(Result)]).
start(R, N) ->
    Self = self(),
    Reducer = start(Self, R, 1, N),
    [ receive {Reducer, Result} -> Result end || _ <- lists:seq(1, R) ].
start(Parent, R, N, N) ->
    spawn_link(fun() -> mapper(Parent, R, N) end);
start(Parent, R, From, To) ->
    spawn_link(fun() -> reducer(Parent, R, From, To) end).
mapper(Parent, R, N) ->
    [ Parent ! {self(), N*N}  || _ <- lists:seq(1, R) ].
reducer(Parent, R, From, To) ->
    Self = self(),
    Middle = ( From + To ) div 2,
    A = start(Self, R, From, Middle),
    B = start(Self, R, Middle + 1, To),
    [ Parent ! {Self, receive {A, X} -> receive {B, Y} -> X+Y end end}
      || _ <- lists:seq(1, R) ].

您可以使用以下命令运行它

$ erlc -W mr.erl
$ time erl -noshell -run mr start 1024 1024 -s init stop
1024 x 358438400
real    0m2.162s
user    0m4.177s
sys     0m0.151s

但大多数时候是 VM 启动和优雅停止开销

$ time erl -noshell -run mr start 1024 1024 -s erlang halt
1024 x 358438400
real    0m1.172s
user    0m4.110s
sys     0m0.150s
$ erl
1> timer:tc(fun() -> mr:start(1024,1024) end).
{978453,
 [358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400|...]}

请记住,它更像是一个优雅的解决方案,而不是一个有效的解决方案。有效的解决方案应该平衡减少树分支与通信开销。

最新更新