我有两个 .csv 文件:
A.csv:
A,B,C,D,E
1,2,3,4,5
5,4,3,2,1
B.csv
A,E,B,C,F
6,7,8,9,1
4,3,4,5,6
我想在 Haskell 中读取它们,并对变量 A、B 和 C 使用严格的解析规则。然后,我想对 A.csv 和 B.csv 的行应用复杂的合并和过滤操作,并根据结果创建文件 C.csv。本文末尾的代码块基本上涵盖了这个功能。
问题:
我现在想做到以上所有这些,同时保留变量 D、E 和 F。在我的真实数据集中,这样的附加列数量未知且任意,我无法轻易地在各自的数据类型(下面的 ABC)中表示它们。所有这些附加列都应该保留,并在输出数据集中得到适当的表示。
使用下面的代码,C.csv如下所示:
A,B,C
1,2,3
5,4,3
6,8,9
4,4,5
我希望得到这样的结果:
A,B,C,D,E,F
1,2,3,4,5,_
5,4,3,2,1,_
6,8,9,_,7,1
4,4,5,_,3,6
有办法用 cassava 做到这一点吗?还是我必须从头编写自定义解析器才能获得此功能?如果是这样,我该怎么做呢?
下面的示例代码缺少所需的特性。它是一个独立的 stack 脚本。
#!/usr/bin/env stack
-- stack --resolver lts-18.7 script --package cassava,bytestring,vector
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
import qualified Data.ByteString.Lazy as B
import qualified Data.Csv as C
import qualified Data.Vector as V
-- | One CSV row reduced to the three typed columns @A@, @B@ and @C@.
data ABC = ABC
  { a :: Int -- ^ value of column @A@
  , b :: Int -- ^ value of column @B@
  , c :: Int -- ^ value of column @C@
  }
  deriving Show
-- | Decode an 'ABC' by looking up the columns @A@, @B@ and @C@ by name.
-- Any other column present in the record is ignored.
instance C.FromNamedRecord ABC where
  parseNamedRecord row = do
    colA <- row C..: "A"
    colB <- row C..: "B"
    colC <- row C..: "C"
    pure (ABC colA colB colC)
-- | Encode an 'ABC' as a named record with exactly the columns
-- @A@, @B@ and @C@.
instance C.ToNamedRecord ABC where
  toNamedRecord (ABC colA colB colC) =
    C.namedRecord
      [ "A" C..= colA
      , "B" C..= colB
      , "C" C..= colC
      ]
-- | Decode CSV content (with a header line) into a list of 'ABC' rows.
--
-- The parsed header is discarded. If decoding fails, 'error' is called
-- with the decoder's message.
decodeABC :: B.ByteString -> [ABC]
decodeABC = either error (V.toList . snd) . C.decodeByName
-- | Column names, in output order, used when writing @C.csv@.
header :: C.Header
header = C.header ["A", "B", "C"]
-- | Read @A.csv@ and @B.csv@, print the decoded rows of each, and write
-- the concatenation of both row lists to @C.csv@.
main :: IO ()
main = do
  rowsA <- decodeABC <$> B.readFile "A.csv"
  rowsB <- decodeABC <$> B.readFile "B.csv"
  print rowsA
  print rowsB
  B.writeFile "C.csv" (C.encodeByName header (rowsA ++ rowsB))
这段代码包含了所需的功能(感谢@Daniel Wagner的输入):
#!/usr/bin/env stack
-- stack --resolver lts-18.7 script --package cassava,bytestring,vector,unordered-containers
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
import qualified Data.ByteString.Lazy as B
import qualified Data.Csv as C
import qualified Data.HashMap.Strict as HM
import qualified Data.Vector as V
-- | One CSV row: three typed columns plus a raw named record holding
-- additional columns.
data ABC = ABC
  { a :: Int -- ^ value of column @A@
  , b :: Int -- ^ value of column @B@
  , c :: Int -- ^ value of column @C@
  , addCols :: C.NamedRecord -- ^ additional columns, kept as raw fields
  }
  deriving Show
-- | Names of the columns that 'ABC' parses into typed fields.
--
-- The explicit signature pins the string literals to cassava's column
-- name type instead of leaving it to be inferred from use sites.
abcDefinedCols :: [C.Name]
abcDefinedCols = ["A", "B", "C"]
-- | The typed column names as the keys of a map with unit values, so they
-- can be passed as the second argument of @HM.difference@.
abcRefHashMap :: HM.HashMap C.Name ()
abcRefHashMap = HM.fromList [(name, ()) | name <- abcDefinedCols]
-- | Decode an 'ABC': columns @A@, @B@ and @C@ are parsed into typed
-- fields, and the record minus the keys of 'abcRefHashMap' is stored in
-- 'addCols'.
instance C.FromNamedRecord ABC where
  parseNamedRecord row =
    ABC
      <$> row C..: "A"
      <*> row C..: "B"
      <*> row C..: "C"
      <*> pure (HM.difference row abcRefHashMap)
-- | Encode an 'ABC' as the union of its additional columns and the typed
-- columns @A@, @B@ and @C@.
instance C.ToNamedRecord ABC where
  toNamedRecord (ABC colA colB colC extras) = HM.union extras typed
    where
      -- 'extras' is the left argument of the union, as in the original.
      typed =
        C.namedRecord
          [ "A" C..= colA
          , "B" C..= colB
          , "C" C..= colC
          ]
-- | Decode CSV content (with a header line) into a list of 'ABC' rows.
--
-- The parsed header is dropped. A decoding failure is turned into a call
-- to 'error' carrying the decoder's message.
decodeABC :: B.ByteString -> [ABC]
decodeABC x = either error rowsOf (C.decodeByName x)
  where
    rowsOf (_, rows) = V.toList rows
-- | Build the output header: the typed column names first, followed by
-- the key of every additional column found in any of the given rows.
makeCompleteHeader :: [ABC] -> C.Header
makeCompleteHeader ms = V.fromList (abcDefinedCols ++ extraNames)
  where
    -- Keys of the union of all rows' additional columns.
    extraNames = HM.keys (HM.unions (map addCols ms))
-- | Concatenate two row lists and pad every row so that all rows carry the
-- same set of additional columns.
--
-- An additional column that a row lacks is filled with the placeholder
-- @"n/a"@; values a row already has are kept.
combineABCs :: [ABC] -> [ABC] -> [ABC]
combineABCs xs1 xs2 = map padRow allRows
  where
    allRows = xs1 ++ xs2

    -- Every additional column name seen in any row, mapped to the
    -- placeholder value.
    placeholders :: C.NamedRecord
    placeholders = HM.map (const "n/a") (HM.unions (map addCols allRows))

    -- The row's own columns are the left argument of the union, so
    -- placeholders only fill in columns the row does not have.
    padRow row = row { addCols = addCols row `HM.union` placeholders }
-- | Read @A.csv@ and @B.csv@, print the decoded rows of each, combine
-- them with 'combineABCs', and write the result to @C.csv@ using the
-- header from 'makeCompleteHeader'.
main :: IO ()
main = do
  rowsA <- decodeABC <$> B.readFile "A.csv"
  rowsB <- decodeABC <$> B.readFile "B.csv"
  print rowsA
  print rowsB
  let merged = combineABCs rowsA rowsB
      outHeader = makeCompleteHeader merged
  B.writeFile "C.csv" (C.encodeByName outHeader merged)
-- | Three typed columns plus a raw named record.
data ABCPlus = ABCPlus
  { a :: Int -- ^ value of column @A@
  , b :: Int -- ^ value of column @B@
  , c :: Int -- ^ value of column @C@
  , d :: NamedRecord -- ^ the record this row was parsed from
  }
  deriving Show
-- | Decode an 'ABCPlus': columns @A@, @B@ and @C@ are parsed into typed
-- fields and the complete input record is kept in 'd'.
instance FromNamedRecord ABCPlus where
  parseNamedRecord m =
    -- Build with the 'ABCPlus' constructor; the instance is for 'ABCPlus'.
    pure ABCPlus
      <*> m .: "A"
      <*> m .: "B"
      <*> m .: "C"
      <*> pure m -- or perhaps: pure (m `HM.difference` HM.fromList [("A", ()), ("B", ()), ("C", ())])
-- | Encode an 'ABCPlus' by emitting the named record stored in 'd'.
-- The typed fields 'a', 'b' and 'c' are not consulted here.
instance ToNamedRecord ABCPlus where
  toNamedRecord = d -- or perhaps: \m -> d m `HM.union` namedRecord ["A" .= a m, "B" .= b m, "C" .= c m]
-- | Build the output header: the fixed columns @A@, @B@ and @C@ followed
-- by every other column name found in the 'd' record of any row.
headers :: [ABCPlus] -> Header
headers ms = header $ ["A", "B", "C"] ++ HM.keys combined
  where
    -- Remove the fixed columns so they are not listed a second time.
    relevant m = m `HM.difference` HM.fromList [("A", ()), ("B", ()), ("C", ())] -- or perhaps: m

    -- Union of the remaining columns over all rows.
    combined = HM.unions [relevant (d m) | m <- ms]