使用map和ByteString键对折叠进行性能分析

我有一个小脚本可以从apache日志文件中读取、解析和导出一些有趣的（实际上不是）统计信息。到目前为止，我已经做了两个简单的选项，日志文件中所有请求发送的字节总数，以及最常见的前10个IP地址。

第一个"模式"只是所有解析字节的简单总和。第二个是折叠地图（Data.map），使用insertWith (+) 1'来计算出现次数。

第一个正如我所期望的那样运行，大部分时间都花在解析上，在恒定的空间中。

42359709344字节分配在堆GC期间复制了72405840字节113712字节最大驻留时间（1553个样本）最大斜率145872字节使用中的总内存为2 MB（由于碎片而丢失0 MB）

第0代：76311个集合，
0并行，0.89秒，0.99s已过去
第1代：1553个集合，0并行，经过0.21s，0.22s

INIT时间0.00s（0.00s经过）MUT时间21.76s(24.82秒）GC时间1.10s（1.20秒）退出时间
0.00s（经过0.00s）总时间22.87s（经过26.02s）

%GC时间4.8%（经过4.6%）

分配速率1946258962字节每MUT秒

生产力占总用户的95.2%，总运行时间的83.6%

然而，第二个没有！

中分配了49398834152个字节堆GC期间复制了580579208字节718385088字节最大驻留时间（15个样本）134532128字节最大斜率使用中的总内存为1393 MB（由于碎片而丢失172 MB）

第0代：91275个集合，
0平行，252.65s，254.46s已过去
第1代：15个集合，0平行，0.12s，0.12s经过

INIT时间0.00s（0.00s经过）MUT时间41.11s(48.87秒）GC时间252.77秒（254.58秒）退出时间
0.00秒（经过0.01秒）总时间293.88秒（经过303.45秒）

%GC时间86.0%（83.9%）

分配速率1201635385字节每MUT秒

生产力占总用户的14.0%，总运行的13.5%

这是代码。

{-# LANGUAGE OverloadedStrings #-}
module Main where
import qualified Data.Attoparsec.Lazy as AL
import Data.Attoparsec.Char8 hiding (space, take)
import qualified Data.ByteString.Char8 as S
import qualified Data.ByteString.Lazy.Char8 as L
import Control.Monad (liftM)
import System.Environment (getArgs)
import Prelude hiding (takeWhile)
import qualified Data.Map as M
import Data.List (foldl', sortBy)
import Text.Printf (printf)
import Data.Maybe (fromMaybe)
type Command = String
data LogLine = LogLine {
    getIP     :: S.ByteString,
    getIdent  :: S.ByteString,
    getUser   :: S.ByteString,
    getDate   :: S.ByteString,
    getReq    :: S.ByteString,
    getStatus :: S.ByteString,
    getBytes  :: S.ByteString,
    getPath   :: S.ByteString,
    getUA     :: S.ByteString
} deriving (Ord, Show, Eq)
quote, lbrack, rbrack, space :: Parser Char
quote  = satisfy (== '"')
lbrack = satisfy (== '[')
rbrack = satisfy (== ']')
space  = satisfy (== ' ')
quotedVal :: Parser S.ByteString
quotedVal = do
    quote
    res <- takeTill (== '"')
    quote
    return res
bracketedVal :: Parser S.ByteString
bracketedVal = do
    lbrack
    res <- takeTill (== ']')
    rbrack
    return res
val :: Parser S.ByteString
val = takeTill (== ' ')
line :: Parser LogLine
l    ine = do
    ip <- val
    space
    identity <- val
    space
    user <- val
    space
    date <- bracketedVal
    space
    req <- quotedVal
    space
    status <- val
    space
    bytes <- val
    (path,ua) <- option ("","") combined
    return $ LogLine ip identity user date req status bytes path ua
combined :: Parser (S.ByteString,S.ByteString)
combined = do
    space
    path <- quotedVal
    space
    ua <- quotedVal
    return (path,ua)
countBytes :: [L.ByteString] -> Int
countBytes = foldl' count 0
    where
        count acc l = case AL.maybeResult $ AL.parse line l of
            Just x  -> (acc +) . maybe 0 fst . S.readInt . getBytes $ x
            Nothing -> acc
countIPs :: [L.ByteString] -> M.Map S.ByteString Int
countIPs = foldl' count M.empty
    where
        count acc l = case AL.maybeResult $ AL.parse line l of
            Just x -> M.insertWith' (+) (getIP x) 1 acc
            Nothing -> acc
---------------------------------------------------------------------------------
main :: IO ()
main = do
  [cmd,path] <- getArgs
  dispatch cmd path
pretty :: Show a => Int -> (a, Int) -> String
pretty i (bs, n) = printf "%d: %s, %d" i (show bs) n
dispatch :: Command -> FilePath -> IO ()
dispatch cmd path = action path
    where
        action = fromMaybe err (lookup cmd actions)
        err    = printf "Error: %s is not a valid command." cmd
actions :: [(Command, FilePath -> IO ())]
actions = [("bytes", countTotalBytes)
          ,("ips",  topListIP)]
countTotalBytes :: FilePath -> IO ()
countTotalBytes path = print . countBytes . L.lines =<< L.readFile path
topListIP :: FilePath -> IO ()
topListIP path = do
    f <- liftM L.lines $ L.readFile path
    let mostPopular (_,a) (_,b) = compare b a
        m = countIPs f
    mapM_ putStrLn . zipWith pretty [1..] . take 10 . sortBy mostPopular . M.toList $ m

编辑：

添加+RS-A16M可将GC降低至20%。记忆的使用当然没有改变。

我建议对代码进行以下更改：

@@ -1,4 +1,4 @@
-{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE BangPatterns, OverloadedStrings #-}
 module Main where
@@ -9,7 +9,7 @@
 import Control.Monad (liftM)
 import System.Environment (getArgs)
 import Prelude hiding (takeWhile)
-import qualified Data.Map as M
+import qualified Data.HashMap.Strict as M
 import Data.List (foldl', sortBy)
 import Text.Printf (printf)
 import Data.Maybe (fromMaybe)
@@ -17,15 +17,15 @@
 type Command = String
 data LogLine = LogLine {
-    getIP     :: S.ByteString,
-    getIdent  :: S.ByteString,
-    getUser   :: S.ByteString,
-    getDate   :: S.ByteString,
-    getReq    :: S.ByteString,
-    getStatus :: S.ByteString,
-    getBytes  :: S.ByteString,
-    getPath   :: S.ByteString,
-    getUA     :: S.ByteString
+    getIP     :: !S.ByteString,
+    getIdent  :: !S.ByteString,
+    getUser   :: !S.ByteString,
+    getDate   :: !S.ByteString,
+    getReq    :: !S.ByteString,
+    getStatus :: !S.ByteString,
+    getBytes  :: !S.ByteString,
+    getPath   :: !S.ByteString,
+    getUA     :: !S.ByteString
 } deriving (Ord, Show, Eq)
 quote, lbrack, rbrack, space :: Parser Char
@@ -39,14 +39,14 @@
     quote
     res <- takeTill (== '"')
     quote
-    return res
+    return $! res
 bracketedVal :: Parser S.ByteString
 bracketedVal = do
     lbrack
     res <- takeTill (== ']')
     rbrack
-    return res
+    return $! res
 val :: Parser S.ByteString
 val = takeTill (== ' ')
@@ -67,14 +67,14 @@
     space
     bytes <- val
     (path,ua) <- option ("","") combined
-    return $ LogLine ip identity user date req status bytes path ua
+    return $! LogLine ip identity user date req status bytes path ua
 combined :: Parser (S.ByteString,S.ByteString)
 combined = do
     space
-    path <- quotedVal
+    !path <- quotedVal
     space
-    ua <- quotedVal
+    !ua <- quotedVal
     return (path,ua)
 countBytes :: [L.ByteString] -> Int
@@ -84,11 +84,11 @@
             Just x  -> (acc +) . maybe 0 fst . S.readInt . getBytes $ x
             Nothing -> acc
-countIPs :: [L.ByteString] -> M.Map S.ByteString Int
+countIPs :: [L.ByteString] -> M.HashMap S.ByteString Int
 countIPs = foldl' count M.empty
     where
         count acc l = case AL.maybeResult $ AL.parse line l of
-            Just x -> M.insertWith' (+) (getIP x) 1 acc
+            Just x -> M.insertWith (+) (getIP x) 1 acc
             Nothing -> acc
 ---------------------------------------------------------------------------------

我使LogLine的字段严格，以避免它们包含引用与解析相关的表达式的thunk。除非你真的需要它们变得懒惰，否则让字段变得严格是一种很好的做法。

我确保尽快创建解析结果（这是更改的$!部分），以避免在实际检查LogLine的各个字段之前延迟解析。

最后，我切换到一个更好的数据结构，来自无序容器包的HashMap。注意，Data.HashMap.Strict中的所有函数都是值严格的，这意味着我们可以使用普通的insertWith变体。

注意，由于共享底层存储（这与Java的String相同），使用ByteString的子字符串会强制将原始字符串保留在内存中。如果要确保没有保留额外内存，请使用bytestring软件包中的copy函数。您可以尝试根据(getIP x)的结果调用copy，看看这是否有什么不同。这里的权衡是使用一些额外的计算来复制字符串，以换取更低的空间使用率。

请注意，使用-A<high number>倾向于提高短期运行程序（即基准测试）的性能，但不一定要提高实际程序的性能。-H也是如此。至少较高的-H值（例如1G）不会影响程序的性能。

最明显的一点是，第一个脚本一看到数据就可以丢弃，而第二个脚本必须保留所看到的一切。因此，您希望第二个脚本至少占用O（N）内存，而第一个脚本可以在恒定空间中运行。

您是否尝试过在启用堆分析的情况下运行？我可以对代码中可能发生超额分配的地方进行一些尝试，但没有什么可以替代硬数据。

我自己也会怀疑地盯着Data.Map.insertWith的调用，因为每一个调用都会使现存的Map盈余中的一大块满足需求，并且需要复制和重新平衡，但这纯粹是我的猜测。如果insertWith调用是罪魁祸首，那么由于您不需要间隙映射条目，可能会更快地在一次遍历中构建整个映射（不需要任何增量来计数IP），然后再进行第二次遍历来进行计数。这样就不会浪费时间重新平衡地图。您还可以利用这样一个事实，即您的密钥数据类型适合Int（好吧，如果它至少是IPv4地址，也可以），并使用Data.IntMap，这样内存开销会低得多。

相关内容

最新更新

热门标签：