Merge pull request #39 from zouroboros/feature/pajek-support

Pajek parser, writer and test cases for pangraph.
tuura · Feb 7, 2019 · a6c4701 · a6c4701
2 parents 029bb0c + 4de4cd8
commit a6c4701
Show file tree

Hide file tree

Showing 6 changed files with 409 additions and 9 deletions.
diff --git a/pangraph.cabal b/pangraph.cabal
@@ -1,12 +1,12 @@
 name:                 pangraph
 version:              0.2.1
-synopsis:             A set of parsers for graph languages and conversions to 
+synopsis:             A set of parsers for graph languages and conversions to
                       graph libaries.
-description:          A package allowing parsing of graph files into graph 
-                      library datatypes. With aim the cope with large networks 
-                      and provide translations between graph libraries. Like a 
-                      pandoc but for graphs. This is my first library so any 
-                      feedback and help is appreicated. For example use please 
+description:          A package allowing parsing of graph files into graph
+                      library datatypes. With aim the cope with large networks
+                      and provide translations between graph libraries. Like a
+                      pandoc but for graphs. This is my first library so any
+                      feedback and help is appreicated. For example use please
                       see the homepage.
 homepage:             https://github.com/tuura/pangraph#readme
 license:              BSD3
@@ -38,6 +38,9 @@ library
                    ,  Pangraph.Gml.Ast
                    ,  Pangraph.Gml.Parser
                    ,  Pangraph.Gml.Writer
+                   ,  Pangraph.Pajek.Ast
+                   ,  Pangraph.Pajek.Parser
+                   ,  Pangraph.Pajek.Writer
   build-depends:      base >= 4.8 && < 5
                    ,  algebraic-graphs    == 0.2.*
                    ,  attoparsec          == 0.13.*
@@ -60,11 +63,11 @@ test-suite pangraph-test
                    ,  Show
                    ,  TestPangraph
                    ,  Gml
+                   ,  Pajek
   build-depends:      base >= 4.8 && < 5
                    ,  containers
                    ,  bytestring
                    ,  HUnit
-                   ,  pangraph 
+                   ,  pangraph
   default-language:   Haskell2010
   GHC-options:        -Wall -fwarn-tabs -fbreak-on-exception
-
diff --git a/src/Pangraph/Pajek/Ast.hs b/src/Pangraph/Pajek/Ast.hs
@@ -0,0 +1,37 @@
+module Pangraph.Pajek.Ast where
+{-|
+Module          : Pangraph.Pajek.Ast
+Description     : Abstract syntax tree for pajek (.net) files
+-}
+data Pajek a = Graph (Maybe a) [Vertex a] [Edge] deriving (Show, Eq, Ord)
+
+type Vertex a = (Integer, Maybe a)
+data Edge = Unweighted Bool Integer Integer
+  | Weighted Bool Integer Integer Double deriving (Show, Eq, Ord)
+
+name :: Pajek a -> Maybe a
+name (Graph n _ _) = n
+
+vertices :: Pajek a -> [Vertex a]
+vertices (Graph _ vs _) = vs
+
+edges :: Pajek a -> [Edge]
+edges (Graph _ _ ed) = ed
+
+label :: Vertex a -> Maybe a
+label (_, l) = l
+
+vertexId :: Vertex a -> Integer
+vertexId (i, _) = i
+
+endpoints :: Edge -> (Integer, Integer)
+endpoints (Unweighted _ n1 n2) = (n1, n2)
+endpoints (Weighted _ n1 n2 _) = (n1, n2)
+
+weight :: Edge -> Maybe Double
+weight (Unweighted _ _ _) = Nothing
+weight (Weighted _ _ _ w) = Just w
+
+directed :: Edge -> Bool
+directed (Unweighted d _ _) = d
+directed (Weighted d _ _ _) = d
diff --git a/src/Pangraph/Pajek/Parser.hs b/src/Pangraph/Pajek/Parser.hs
@@ -0,0 +1,194 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-|
+Module          : Pangraph.Pajek.Parser
+Description     : Parse pajek (.net) files
+
+Functions for parseing pajek files.
+Roughly follows the specification at <https://gephi.org/users/supported-graph-formats/pajek-net-format/>
+
+All input is assumed to be encoded in UTF8
+-}
+module Pangraph.Pajek.Parser (parse, parsePajek, pajekToPangraph) where
+
+import Data.Attoparsec.Text hiding (parse)
+import qualified Data.Text as T
+import Data.Text.Encoding (decodeUtf8)
+import Control.Applicative ((<|>), (<*))
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.Text.Encoding as EC
+import Data.Char (isSpace)
+import Prelude hiding (takeWhile, id, lines, unlines, reverse)
+
+import qualified Pangraph as P
+import Pangraph.Pajek.Ast
+
+-- | Parses the 'ByteString' into a 'Pangraph'.
+--   Please note that this function treats directed and undirected edges in same
+--   way. So if the panjek file contains an arc (undirected edge) between node 1
+--   and node 2 the resulting pangraph will only contain an edge between 1 and 2.
+parse :: B.ByteString -> Maybe P.Pangraph
+parse contents = (parsePajek contents) >>= pajekToPangraph
+
+-- | Converts a 'Pajek' into a 'Pangraph'
+pajekToPangraph :: Pajek T.Text -> Maybe P.Pangraph
+pajekToPangraph pajek = let
+    pVertices = map (vertexToPangraph) (vertices pajek)
+    pEdges = map (edgeToPangraph) (edges pajek)
+    in P.makePangraph pVertices pEdges
+
+vertexToPangraph :: Vertex T.Text -> P.Vertex
+vertexToPangraph (vid, label') =
+    P.makeVertex (BC.pack (show vid))
+    (maybe [] (\l -> [("label", (EC.encodeUtf8 l))]) label')
+
+edgeToPangraph :: Edge -> P.Edge
+edgeToPangraph (Unweighted _ start end) =
+    P.makeEdge (BC.pack (show start), BC.pack (show end)) []
+edgeToPangraph (Weighted _ start end w) =
+    P.makeEdge (BC.pack (show start), BC.pack (show end))
+        [("weight", BC.pack (show w))]
+
+-- | Parses the 'ByteString' into a 'Pajek' ast.
+parsePajek :: B.ByteString -> Maybe (Pajek T.Text)
+parsePajek contents = either (const Nothing) Just
+      (parseText (decodeUtf8 contents))
+
+parseText :: T.Text -> Either String (Pajek T.Text)
+parseText = parseOnly (pajekParser <* endOfInput) . removeComments
+
+removeComments :: T.Text -> T.Text
+removeComments text = T.unlines (filter (not . T.isPrefixOf "%") (T.lines text))
+
+pajekParser :: Parser (Pajek T.Text)
+pajekParser = do
+    gname <- option Nothing (fmap (Just) networkHeader)
+    _ <- verticesHeader
+    vs <- vertexList
+    skipSpace
+    edges' <- sepBy (choice [arcsAsPairs, arcsList, edgesAsPairs, edgesList])
+        skipSpace
+    _ <- manyTill (skipRest) endOfInput
+    return (Graph gname vs (concat edges'))
+
+networkHeader :: Parser (T.Text)
+networkHeader = do
+    star
+    _ <- asciiCI "network"
+    skipSpace
+    gname <- fmap unquoted line
+    endOfLine
+    return gname
+
+verticesHeader :: Parser Integer
+verticesHeader = do
+    star
+    _ <- asciiCI "vertices"
+    skipSpace
+    n <- decimal
+    skipRest
+    return n
+
+vertexList :: Parser [Vertex T.Text]
+vertexList = sepBy vertexEntry endOfLine
+
+vertexEntry :: Parser (Vertex T.Text)
+vertexEntry = do
+    skipSpace
+    vid <- decimal
+    skipSpace
+    label' <- option Nothing $ fmap (Just) maybeQuoted
+    _ <- line
+    return (vid, label')
+
+arcsAsPairs :: Parser [Edge]
+arcsAsPairs = do
+    star
+    _ <- asciiCI "arcs"
+    _ <- line
+    endOfLine
+    pairs <- sepBy (pair False) endOfLine
+    return pairs
+
+edgesAsPairs :: Parser [Edge]
+edgesAsPairs = do
+    star
+    _ <- asciiCI "edges"
+    skipRest
+    pairs <- sepBy (pair True) endOfLine
+    return pairs
+
+edgesList :: Parser [Edge]
+edgesList = do
+    star
+    _ <- asciiCI "edgeslist"
+    skipRest
+    adjacencyList True
+
+arcsList :: Parser [Edge]
+arcsList = do
+    star
+    _ <- asciiCI "arcslist"
+    skipRest
+    adjacencyList False
+
+adjacencyList :: Bool -> Parser [Edge]
+adjacencyList directed' = do
+    edges' <- sepBy edgeList endOfLine
+    let flatEdges = concatMap (\(s, tids) -> map (\t -> (s, t)) tids) edges'
+    return (map (uncurry (Unweighted directed')) flatEdges)
+
+edgeList :: Parser (Integer, [Integer])
+edgeList = do
+    _ <- option ' ' (char '-')
+    sId <- decimal
+    skipSpace1
+    tIds <- sepBy decimal skipSpace1
+    _ <- line
+    return (sId, tIds)
+
+pair :: Bool -> Parser Edge
+pair directed' = do
+    skipSpace
+    source <- decimal
+    _ <- skipSpace1
+    target <- decimal
+    edge <- option (Unweighted directed' source target)
+                (fmap (Weighted directed' source target) edgeWeight)
+    _ <- line
+    return edge
+
+edgeWeight :: Parser Double
+edgeWeight = do
+    _ <- skipSpace1
+    w <- double
+    return w
+
+star :: Parser ()
+star = skip (== '*')
+
+line :: Parser T.Text
+line = takeTill (isEndOfLine)
+
+maybeQuoted :: Parser T.Text
+maybeQuoted = quoted <|> (fmap unquoted (takeTill isSpace))
+
+quoted :: Parser T.Text
+quoted = do
+    _ <- satisfy (== '\"')
+    t <- takeTill (== '\"')
+    _ <- satisfy (== '\"')
+    return t
+
+unquoted :: T.Text -> T.Text
+unquoted = T.filter (/= '\"')
+
+skipSpace1 :: Parser ()
+skipSpace1 = do
+    _ <- takeWhile1 isHorizontalSpace
+    return ()
+
+skipRest :: Parser ()
+skipRest = do
+    _ <- line
+    endOfLine
diff --git a/src/Pangraph/Pajek/Writer.hs b/src/Pangraph/Pajek/Writer.hs
@@ -0,0 +1,77 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-|
+Module          : Pangraph.Pajek.Writer
+Description     : Write pajek (.net) files
+
+Functions for writeing pajek files.
+Roughly follows the specification <https://gephi.org/users/supported-graph-formats/pajek-net-format/>
+-}
+module Pangraph.Pajek.Writer (write, pangraphToPajek, writePajek) where
+
+import Data.Maybe
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Char8 as BC
+import Data.ByteString.Builder
+import qualified Data.ByteString.Lazy as L
+import Data.Monoid
+import Prelude hiding (concat, (++))
+
+import qualified Pangraph as P
+import Pangraph.Pajek.Ast
+
+-- | Writes a 'Pangraph' into a bytestring.
+-- | Since there is no way of knowing weather a edge is directed or undirected
+-- | in a 'Pangraph' all edges are written as directed edges.
+write :: P.Pangraph -> ByteString
+write = writePajek . pangraphToPajek
+
+-- | Converts a 'Pangraph' into a 'Pajek' graph.
+pangraphToPajek :: P.Pangraph -> Pajek ByteString
+pangraphToPajek graph = let
+  pEdges = P.edgeList graph
+  pVertices = P.vertexList graph
+  in Graph Nothing (map vertexToPajek pVertices) (map edgeToPajek pEdges)
+
+vertexToPajek :: P.Vertex -> Vertex ByteString
+vertexToPajek vertex = let
+  label' = fmap snd $ listToMaybe $ filter ((== "label") . fst)
+    (P.vertexAttributes vertex)
+  vId = read $ BC.unpack $ P.vertexID vertex
+  in (vId, label')
+
+edgeToPajek :: P.Edge -> Edge
+edgeToPajek edge = let
+  weight' = fmap (read . BC.unpack . snd) $ listToMaybe $ filter ((== "weight") . fst)
+    (P.edgeAttributes edge)
+  (source', target') = P.edgeEndpoints edge
+  source = read $ BC.unpack source'
+  target = read $ BC.unpack target'
+  in maybe (Unweighted False source target) (\w -> Weighted False source target w) weight'
+
+vertexLine :: Vertex ByteString -> Builder
+vertexLine vertex = let
+    vId = show $ vertexId vertex
+    label' = maybe (stringUtf8 "")
+        (\s -> charUtf8 '\"' <> byteString s <> charUtf8 '\"') (label vertex)
+    in stringUtf8 vId <> charUtf8 ' ' <> label'
+
+edgeLine :: Edge -> Builder
+edgeLine edge = let
+    (source, target) = endpoints edge
+    weight' = maybe (stringUtf8 "")
+        (\w -> charUtf8 ' ' <> stringUtf8 (show w)) (weight edge)
+    in stringUtf8 (show source) <> charUtf8 ' ' <> stringUtf8 (show target)
+        <> weight'
+
+pajekBuilder :: Pajek ByteString -> Builder
+pajekBuilder pajek = do
+  let vertexCount = length (vertices pajek)
+  let edgeCount = length (edges pajek)
+  stringUtf8 "*Vertices " <> stringUtf8 (show vertexCount) <> charUtf8 '\n'
+    <> mconcat [vertexLine v <> charUtf8 '\n' | v <- vertices pajek]
+    <> "*Arcs " <> stringUtf8 (show edgeCount) <> charUtf8 '\n'
+    <> mconcat [edgeLine e <> charUtf8 '\n' | e <- edges pajek]
+
+-- | Writes a 'Pajek' into a 'ByteString'
+writePajek :: Pajek ByteString -> ByteString
+writePajek pajek = L.toStrict (toLazyByteString (pajekBuilder pajek))
diff --git a/test/Main.hs b/test/Main.hs
@@ -6,8 +6,9 @@ import Containers
 import TestPangraph
 import FGL
 import Gml
+import Pajek
 
 import Test.HUnit
 
 main :: IO Counts
-main = (runTestTT . TestList . concat) [containersTests, fglTests, graphmlTests, showTests, pangraphTests, gmlTests]
+main = (runTestTT . TestList . concat) [containersTests, fglTests, graphmlTests, showTests, pangraphTests, gmlTests, pajekTests]