From 13b34530099069e4bf5b397088910ae6d0205b93 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 4 Dec 2024 12:18:13 -0800 Subject: [PATCH] Add mdoc reader This change introduces a reader for mdoc, a roff-derived semantic markup language for manual pages. The two relevant contemporary implementations of mdoc for manual pages are mandoc (https://mandoc.bsd.lv/), which implements the language from scratch in C, and groff (https://www.gnu.org/software/groff/), which implements it as roff macros. mdoc has a lot of semantics specific to technical manuals that aren't representable in Pandoc's AST. I've taken a cue from the mandoc HTML output and many mdoc elements are encoded as Codes or Spans with classes named for the mdoc macro that produced them. Much like web browsers with HTML, mandoc attempts to produce best-effort output given all kinds of weird and crappy mdoc input. Part of the reason it's able to do this is it uses a very accommodating parse tree and stateful output routines specialized to the output mode, and when it encounters some macro it wasn't expecting, it can easily give up on whatever it was outputting and output something else. I've encoded as much flexibility as I reasonably could into the mdoc reader here, but I don't know how to be as flexible as mandoc. This branch has been developed almost exclusively against mandoc's documentation and implementation of mdoc as a reference, and the real-world manual pages tested against are those from the OpenBSD base system. Of ~3500 manuals in mdoc format shipped with a fresh OpenBSD install, 17 cause the mdoc reader to exit with a parse error. Any further chasing of edge cases is deferred to future work. Many of the tests in test/Tests/Readers/Mdoc.hs are derived directly from mandoc's extensive regression tests. [API change] Adds readMdoc to the public API --- pandoc.cabal | 5 + src/Text/Pandoc/Readers.hs | 3 + src/Text/Pandoc/Readers/Mdoc.hs | 1469 +++++++++++++++++++++ src/Text/Pandoc/Readers/Mdoc/Lex.hs | 225 ++++ src/Text/Pandoc/Readers/Mdoc/Macros.hs | 183 +++ src/Text/Pandoc/Readers/Mdoc/Standards.hs | 69 + src/Text/Pandoc/Readers/Roff/Escape.hs | 4 +- test/Tests/Readers/Mdoc.hs | 321 +++++ test/command/mdoc-An.md | 17 + test/command/mdoc-Bd-unfilled.md | 24 + test/command/mdoc-Bl-column.md | 22 + test/command/mdoc-Bl-tag.md | 36 + test/test-pandoc.hs | 2 + 13 files changed, 2378 insertions(+), 2 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Mdoc.hs create mode 100644 src/Text/Pandoc/Readers/Mdoc/Lex.hs create mode 100644 src/Text/Pandoc/Readers/Mdoc/Macros.hs create mode 100644 src/Text/Pandoc/Readers/Mdoc/Standards.hs create mode 100644 test/Tests/Readers/Mdoc.hs create mode 100644 test/command/mdoc-An.md create mode 100644 test/command/mdoc-Bd-unfilled.md create mode 100644 test/command/mdoc-Bl-column.md create mode 100644 test/command/mdoc-Bl-tag.md diff --git a/pandoc.cabal b/pandoc.cabal index 2896d1ee513f..089379253cf7 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -596,6 +596,7 @@ library Text.Pandoc.Readers.EPUB, Text.Pandoc.Readers.Muse, Text.Pandoc.Readers.Man, + Text.Pandoc.Readers.Mdoc, Text.Pandoc.Readers.FB2, Text.Pandoc.Readers.DokuWiki, Text.Pandoc.Readers.Ipynb, @@ -707,6 +708,9 @@ library Text.Pandoc.Readers.LaTeX.Parsing, Text.Pandoc.Readers.LaTeX.SIunitx, Text.Pandoc.Readers.LaTeX.Table, + Text.Pandoc.Readers.Mdoc.Lex, + Text.Pandoc.Readers.Mdoc.Macros, + Text.Pandoc.Readers.Mdoc.Standards, Text.Pandoc.Readers.Typst.Parsing, Text.Pandoc.Readers.Typst.Math, Text.Pandoc.Readers.ODT.Base, @@ -831,6 +835,7 @@ test-suite test-pandoc Tests.Readers.Muse Tests.Readers.Creole Tests.Readers.Man + Tests.Readers.Mdoc Tests.Readers.FB2 Tests.Readers.DokuWiki Tests.Writers.Native diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index ba5170a10b56..2efd061dffed 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -51,6 +51,7 @@ module Text.Pandoc.Readers , readEPUB , readMuse , readMan + , readMdoc , readFB2 , readIpynb , readCSV @@ -106,6 +107,7 @@ import Text.Pandoc.Readers.TWiki import Text.Pandoc.Readers.Txt2Tags import Text.Pandoc.Readers.Vimwiki import Text.Pandoc.Readers.Man +import Text.Pandoc.Readers.Mdoc import Text.Pandoc.Readers.CSV import Text.Pandoc.Readers.CslJson import Text.Pandoc.Readers.BibTeX @@ -168,6 +170,7 @@ readers = [("native" , TextReader readNative) ,("rtf" , TextReader readRTF) ,("typst" , TextReader readTypst) ,("djot" , TextReader readDjot) + ,("mdoc" , TextReader readMdoc) ] -- | Retrieve reader, extensions based on format spec (format+extensions). diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs new file mode 100644 index 000000000000..2bc61460b7db --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -0,0 +1,1469 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.Mdoc + Copyright : © 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : Evan Silberman + Stability : WIP + Portability : portable + +Conversion of mdoc to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Mdoc (readMdoc) where + +import Data.Char (isAsciiLower, toUpper) +import Data.Default (Default) +import Data.Either (fromRight) +import Data.Functor (($>)) +import Data.Maybe (catMaybes) +import Control.Monad (mplus, guard, void, when, unless) +import Control.Monad.Except (throwError) +#if MIN_VERSION_base(4,19,0) +import Data.List (intersperse, unsnoc) +#else +import Data.List (intersperse, uncons) +#endif +import qualified Data.Map.Strict as M +import qualified Data.Text as T +import Text.Pandoc.Definition (Pandoc(Pandoc), Meta) +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (uncons) +import Text.Pandoc.Logging +import Text.Pandoc.Readers.Mdoc.Lex +import Text.Pandoc.Readers.Mdoc.Standards +import Text.Parsec (modifyState) +import qualified Text.Pandoc.Parsing as P +import qualified Data.Foldable as Foldable +import Text.Pandoc.Shared (stringify) + +#if !MIN_VERSION_base(4,19,0) +unsnoc :: [a] -> Maybe ([a], a) +unsnoc xs = (\(hd, tl) -> (reverse tl, hd)) <$> uncons (reverse xs) +#endif + + {- As a general principle, if mandoc -T lint issues a WARNING admonition + or worse about a construct, I consider it fair game for this reader to + do something different than what mandoc does with it, including bailing + out instead of recovering. -} + +data MdocSection + = ShName + | ShSynopsis + | ShAuthors + | ShSeeAlso + | ShOther + deriving (Show, Eq) + +-- Declaration order is important: this is the order fields of a reference +-- are printed by mandoc +data ReferenceField = + Author + | ArticleTitle + | BookTitle + | Publisher + | Journal + | TechReportTitle + | IssueNumber + | VolumeNumber + | Url + | Pages + | Institution + | PubLocation + | PubDate + | Optional + deriving (Show, Eq, Ord, Enum) + +-- mandoc allows specifying multiple of _any_ reference field, and just +-- prints them all out in document order, even though authors are the only +-- field where this is the documented behavior. There's no lint warning +-- about this either. I'd prefer to do last-one-wins for the non-author +-- fields, which would presumably make it easier to transform the +-- bibliographic data into something else, but for now all I'm doing is +-- printing the references out the same way mandoc does. +type MdocReference = M.Map ReferenceField [T.Text] + +data MdocState = MdocState + { readerOptions :: ReaderOptions + , metadata :: Meta + , tableCellsPlain :: Bool + , spacingMode :: Bool + , authorNameSplit :: Bool + , inLineEnclosure :: Bool + , progName :: Maybe T.Text + , currentSection :: MdocSection + , currentReference :: MdocReference + , logMessages :: [LogMessage] + } + deriving (Show) + +instance Default MdocState where + def = + MdocState + { readerOptions = def + , metadata = B.nullMeta + , tableCellsPlain = True + , spacingMode = True + , authorNameSplit = False + , inLineEnclosure = False + , currentSection = ShOther + , currentReference = M.empty + , progName = Nothing + , logMessages = [] + } + +instance HasLogMessages MdocState where + addLogMessage msg st = st{ logMessages = msg : logMessages st } + getLogMessages st = reverse $ logMessages st + +type MdocParser m = P.ParsecT [MdocToken] MdocState m + + +-- | Read mdoc from an input string and return a Pandoc document. +readMdoc :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readMdoc opts s = do + let Sources inps = toSources s + tokenz <- mconcat <$> mapM (uncurry lexMdoc) inps + let state = def {readerOptions = opts} :: MdocState + eitherdoc <- readWithMTokens parseMdoc state + (Foldable.toList . unMdocTokens $ tokenz) + either (throwError . fromParsecError (Sources inps)) return eitherdoc + + +readWithMTokens :: PandocMonad m + => ParsecT [MdocToken] MdocState m a -- ^ parser + -> MdocState -- ^ initial state + -> [MdocToken] -- ^ input + -> m (Either ParseError a) +readWithMTokens parser state input = + runParserT parser state "source" input + + +parseMdoc :: PandocMonad m => MdocParser m Pandoc +parseMdoc = do + optional parsePrologue + bs <- many parseBlock <* eof + meta <- metadata <$> getState + let (Pandoc _ blocks) = B.doc $ mconcat bs + reportLogMessages + return $ Pandoc meta blocks + +msatisfy :: Monad m + => (MdocToken -> Bool) -> P.ParsecT [MdocToken] st m MdocToken +msatisfy predic = P.tokenPrim show nextPos testTok + where + testTok t = if predic t then Just t else Nothing + nextPos _ _ (Macro _ pos':_) = pos' + nextPos _ _ (Lit _ pos':_) = pos' + nextPos _ _ (Str _ pos':_) = pos' + nextPos _ _ (Delim _ _ pos':_) = pos' + nextPos _ _ (Blank pos':_) = pos' + nextPos a _ (Eol{}:x:xs) = nextPos a x xs + nextPos pos _ [Eol] = pos + nextPos pos _ [] = pos + +macro :: PandocMonad m => T.Text -> MdocParser m MdocToken +macro name = msatisfy t where + t (Macro n _) = n == name + t _ = False + +anyMacro :: PandocMonad m => MdocParser m MdocToken +anyMacro = msatisfy t where + t (Macro _ _) = True + t _ = False + +emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken +emptyMacro n = macro n <* eol + +delim :: PandocMonad m => DelimSide -> MdocParser m MdocToken +delim side = msatisfy t where + t (Delim s _ _) = side == s + t _ = False + +str :: PandocMonad m => MdocParser m MdocToken +str = msatisfy t where + t Str{} = True + t _ = False + +lit :: PandocMonad m => MdocParser m MdocToken +lit = msatisfy t where + t Lit{} = True + t _ = False + +arg :: PandocMonad m => MdocParser m MdocToken +arg = msatisfy t where + t Lit{} = True + t Macro{} = True + t _ = False + +literal :: PandocMonad m => T.Text -> MdocParser m MdocToken +literal n = msatisfy t where + t (Lit n' _) = n == n' + t _ = False + +blank :: PandocMonad m => MdocParser m MdocToken +blank = msatisfy t where + t Blank{} = True + t _ = False + +eol :: PandocMonad m => MdocParser m () +eol = void $ msatisfy t where + t Eol{} = True + t _ = False + +newControlContext :: MdocToken -> Bool +newControlContext Eol{} = True +newControlContext Macro{} = True +newControlContext Str{} = True +newControlContext Blank{} = True +newControlContext Lit{} = False +newControlContext Delim{} = False + + +inlineContextEnd :: PandocMonad m => MdocParser m () +inlineContextEnd = eof <|> (void . lookAhead $ msatisfy newControlContext) + +sectionEnd :: PandocMonad m => MdocParser m () +sectionEnd = eof <|> (void . lookAhead $ macro "Sh") + +argsToInlines :: PandocMonad m => MdocParser m Inlines +argsToInlines = do + ls <- manyTill arg eol + let strs = map (B.str . toString) ls + spacify strs + +parsePrologue :: PandocMonad m => MdocParser m () +parsePrologue = do + macro "Dd" + date <- argsToInlines + macro "Dt" + (Lit title _) <- lit + (Lit section _) <- lit + arch <- optionMaybe (toString <$> lit) + eol + emptyMacro "Os" + let adjust = B.setMeta "title" (B.str title) + . B.setMeta "date" date + . B.setMeta "section" (B.str section) + . maybe id (B.setMeta "architecture" . B.str) arch + modifyState $ \s -> s{metadata = adjust $ metadata s} + +shToSectionMode :: T.Text -> MdocSection +shToSectionMode "NAME" = ShName +shToSectionMode "SYNOPSIS" = ShSynopsis +shToSectionMode "AUTHORS" = ShAuthors +shToSectionMode "SEE ALSO" = ShSeeAlso +shToSectionMode _ = ShOther + +parseHeader :: PandocMonad m => MdocParser m Blocks +parseHeader = do + (Macro m _) <- lookAhead $ macro "Sh" <|> macro "Ss" + txt <- lineEnclosure m id + let lvl = if m == "Sh" then 1 else 2 + when (lvl == 1) $ modifyState $ \s -> s{currentSection = (shToSectionMode . stringify) txt} + return $ B.header lvl txt + +parseNameSection :: PandocMonad m => MdocParser m Blocks +parseNameSection = do + sec <- currentSection <$> getState + guard $ sec == ShName + nms <- mconcat . intersperse B.space <$> many nameNm + macro "Nd" + desc <- argsToInlines + return $ B.para $ nms <> B.space <> "—" <> B.space <> desc + where + nameNm = do + macro "Nm" + nms <- many1 aNm + eol + return $ mconcat $ intersperse B.space nms + comma = msatisfy $ \case + (Delim _ "," _) -> True + _ -> False + aNm = do + nm <- toString <$> lit + c <- option mempty (toString <$> comma) + modifyState $ \s -> s{progName = mplus (progName s) (Just nm)} + return $ B.code nm <> B.str c + +parseSynopsisSection :: PandocMonad m => MdocParser m Blocks +parseSynopsisSection = do + sec <- currentSection <$> getState + guard $ sec == ShSynopsis + parseSynopsis sectionEnd + +parseMiniSynopsis :: PandocMonad m => MdocParser m Blocks +parseMiniSynopsis = do + macro "nr" + literal "nS" + literal "1" + eol + parseSynopsis (sectionEnd <|> end) + where + end = do + macro "nr" + literal "nS" + literal "0" + eol + return () + +parseSynopsis :: PandocMonad m => MdocParser m () -> MdocParser m Blocks +parseSynopsis end = do + bs <- manyTill synopsisBlock end + return $ mconcat bs + where + synopsisGroup p = B.lineBlock <$> many1 p <* optional (emptyMacro "Pp") + synopsisBlock = synopsisGroup parseInvocation + <|> synopsisGroup (parseCd <* optional eol) + <|> synopsisGroup (parseIn <* optional eol) + <|> synopsisGroup (parseFd <* optional eol) + <|> synopsisGroup (parseVt <* optional eol) + <|> try parseSignature + <|> parseWeirdSignature + <|> parseRegularBlock + parseInvocation = do + nm <- parseNm + optional eol + rest <- many synopsisInline + spacify (nm:rest) + parseSignature = do + ft <- parseFt <* optional eol + sig <- (parseFn <|> parseFo) <* optional eol + return $ B.lineBlock [ft, sig <> ";"] + -- e.g. OpenBSD MB_CUR_MAX(3), mild abuse of notation for Ft + parseWeirdSignature = do + ft <- parseFt <* optional eol + rest <- many synopsisInline + line <- spacify (ft:rest) + return $ B.lineBlock [line] + synopsisInline = parseSmToggle <|> parseStrs <|> (controlLine >>= spacify) "synopsis inlines" + safeEol = do + amNested <- inLineEnclosure <$> getState + unless amNested $ optional eol + controlLine = many1 ((choice otherInlineMacros <|> litsAndDelimsToInlines) <* safeEol) + +parseSeeAlsoSection :: PandocMonad m => MdocParser m Blocks +parseSeeAlsoSection = do + sec <- currentSection <$> getState + guard $ sec == ShSeeAlso + blocks <- many1Till parseSeeAlsoBlock sectionEnd + return $ mconcat blocks + where + parseSeeAlsoBlock = parseRegularBlock <|> (B.para <$> parseRs) + +-- roff(7) says "In text lines, whitespace is preserved within a line." I +-- considered following this rule but it really cuts against the grain of what +-- Pandoc writers want to work with, for no clear benefit. This isn't wholly +-- inconsistent with mandoc, because it makes no effort to render multiple +-- consecutive spaces from the source document in HTML. Hence I call B.text +-- instead of B.str +parseStr :: PandocMonad m => MdocParser m Inlines +parseStr = do + (Str txt _) <- str + return $ B.text txt + +-- It's unclear whether consecutive text lines ought to be affected by the +-- spacing mode. mdoc(7) claims that: +-- +-- > By default, spacing is on. When switched off, no white space is +-- > inserted between macro arguments and between the output generated from +-- > adjacent macros, but text lines still get normal spacing between words +-- > **and sentences.** +-- +-- (emphasis added) +-- This implied to me that while spacing is off, consecutive text lines +-- would have spacing between them as normal. In fact, in mandoc's +-- implementation, they do not: +-- +-- text +-- .Sm off +-- text. +-- text +-- .Sm on +-- text +-- +-- renders as +-- +-- text text.text text +-- +-- (The "." is in there since the allusion in the documentation to +-- sentences made me wonder if that made a difference; it doesn't.) +-- +-- I've chosen to adopt my interpretation of the documented behavior, rather +-- than mandoc's implementation. Multiple consecutive strs within a block get +-- spaces between them and then packed up together, and text lines are not +-- affected by the spacing mode. +-- +-- Reported at https://inbox.vuxu.org/mandoc-discuss/369KFE6SHMXSE.3PS4387AYEFB5@silby.fyi/T/ +parseStrs :: PandocMonad m => MdocParser m Inlines +parseStrs = do + txt <- many1 parseStr + return $ mconcat $ intersperse B.space txt + +parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines +parseDelim pos = do + (Delim _ txt _) <- delim pos + return $ B.str txt + +litsToText :: PandocMonad m => MdocParser m [T.Text] +litsToText = do + ls <- many1 lit + return $ map toString ls + +litsToInlines :: PandocMonad m => MdocParser m Inlines +litsToInlines = do + ls <- many1 lit + let strs = map (B.str . toString) ls + spacify strs + +litsAndDelimsToInlines :: PandocMonad m => MdocParser m Inlines +litsAndDelimsToInlines = do + (o, ls, c) <- delimitedArgs $ many lit + guard $ not (null o && null ls && null c) + strs <- spacify $ map (B.str . toString) ls + return $ o <> strs <> c + +openingDelimiters :: PandocMonad m => MdocParser m Inlines +openingDelimiters = do + openDelim <- mconcat <$> many (parseDelim Open) + omids <- pipes + addSpace <- spacingMode <$> getState + let omid | null omids = mempty + | addSpace = omids <> B.space + | otherwise = omids + return $ openDelim <> omid + +pipes :: PandocMonad m => MdocParser m Inlines +pipes = many (parseDelim Middle) >>= spacify + +closingDelimiters :: PandocMonad m => MdocParser m Inlines +closingDelimiters = do + cmids <- pipes + addSpace <- spacingMode <$> getState + let cmid | null cmids = mempty + | addSpace = B.space <> cmids + | otherwise = cmids + closeDelim <- mconcat <$> many (parseDelim Close) + return $ cmid <> closeDelim + +delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) +delimitedArgs p = do + openDelim <- openingDelimiters + inlines <- p + closeDelim <- closingDelimiters + return (openDelim, inlines, closeDelim) + +simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +simpleInline nm xform = do + macro nm + segs <- manyTill segment inlineContextEnd + spacify segs + where + segment = do + (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines + return $ openDelim <> xform inlines <> closeDelim + +codeLikeInline' :: PandocMonad m => T.Text -> T.Text -> MdocParser m Inlines +codeLikeInline' nm cl = simpleInline nm (eliminateEmpty (B.codeWith (cls cl) . stringify)) + +codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines +codeLikeInline nm = codeLikeInline' nm nm + +spanLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines +spanLikeInline nm = simpleInline nm (eliminateEmpty (B.spanWith (cls nm))) + +-- One-line enclosures need a little bit of state so that we don't parse +-- the closing delimiters that follow nested one-line or multiline +-- enclosures; the closing delimiters are meant to go after the close of +-- the outermost enclosure. Hence we respect and set inLineEnclosure. +lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +lineEnclosure nm xform = do + macro nm + amNested <- inLineEnclosure <$> getState + modifyState $ \s -> s{inLineEnclosure = True} + first <- openingDelimiters + further <- + (manyTill + (parseInlineMacro + <|> (try (litsAndDelimsToInlines <* notFollowedBy eol)) + <|> litsToInlines + <|> openingDelimiters) + lineEnclosureContextEnd) + further' <- spacify further + finally <- if amNested then mempty else closingDelimiters <* optional eol + modifyState $ \s -> s{inLineEnclosure = amNested} + return $ first <> xform further' <> finally + where + lineEnclosureContextEnd = + try $ + void (lookAhead (macro "Ta")) + <|> lookAhead (many (macro "Ns" <|> delim Close) *> eol) + + +-- The Ns, Ap, and Sm macros affect the automatic insertion of spaces between +-- macro arguments that occurs by default. We parse these macros to RawInlines +-- that we then eliminate in foldNoSpaces. If any of these macros end up +-- in the final AST returned by readMdoc, it's a bug. + +noSpace :: Inlines +noSpace = B.rawInline "mdoc" "Ns" + +apMacro :: Inlines +apMacro = B.rawInline "mdoc" "Ap" + +smOff :: Inlines +smOff = B.rawInline "mdoc" "Sm off" + +smOn :: Inlines +smOn = B.rawInline "mdoc" "Sm on" + +-- Accumulator for eliminating of Ns, Ap, and Sm macros from a list of 'Inlines' +data SpacifyState = SpacifyState + { accum :: [Inlines], -- already-folded 'Inlines' + prev :: Inlines, -- content we might be appending further content to + ns :: Bool, -- True when we've read an Ns and are waiting to concatenate content to prev + sm :: Bool -- True when spacing mode is on + } + +instance Default SpacifyState where + def = SpacifyState [] mempty False True + +-- Given a list of 'Inlines'es, concatenate consecutive elements that shouldn't +-- have a 'Space' inserted between them based on changes to the spacing mode, +-- Ap macros, and Ns macros. +foldNoSpaces :: [Inlines] -> [Inlines] +foldNoSpaces xs = (finalize . foldl go def) xs + where + go :: SpacifyState -> Inlines -> SpacifyState + go s x + | ns s && x == noSpace = s + | x == apMacro = s{prev = prev s <> "'", ns = True} + | x == noSpace = s{ns = True} + | x == smOn = s{sm = True} + | sm s && x == smOff = s{accum = accum s <> [prev s], prev = mempty, sm = False} + | ns s = s{prev = prev s <> x, ns = False} + | not (sm s) = s{prev = prev s <> x} + | null (prev s) = s{prev = x} + | otherwise = s{accum = accum s <> [prev s], prev = x} + finalize s + | null (prev s) = accum s + | otherwise = accum s <> [prev s] + +-- Add any necessary spaces between individual 'Inlines' in a list. +-- Respects the spacing mode status. This should more or less +-- always get applied to any list of 'Inlines' before doing anything +-- else with it. +spacify :: PandocMonad m => [Inlines] -> MdocParser m Inlines +spacify x = do + mode <- spacingMode <$> getState + return (go mode x) + where + go True = mconcat . intersperse B.space . foldNoSpaces + go False = mconcat . foldNoSpaces + +-- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros +-- (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations +-- like Do/Dc. However there are no OpenBSD manual pages that employ such markup +-- and it doesn't look right when rendered. We don't attempt to consume anything +-- but pandoc inlines inside of these multiline enclosures. +multilineEnclosure :: PandocMonad m => T.Text -> T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +multilineEnclosure op cl xform = do + macro op + amNested <- inLineEnclosure <$> getState + -- we're now "protected" from any outer enclosure or .It + modifyState $ \s -> s{inLineEnclosure = False} + openDelim <- mconcat <$> many (parseDelim Open) + optional eol + contents <- parseInlines + (macro cl show cl) + closeDelim <- + if amNested + then mempty + else mconcat <$> many (parseDelim Close) <* optional eol + modifyState $ \s -> s{inLineEnclosure = amNested} + return $ openDelim <> xform contents <> closeDelim + +parseEo :: PandocMonad m => MdocParser m Inlines +parseEo = do + macro "Eo" + odel <- del + optional eol + inner <- parseInlines + macro "Ec" + cdel <- del + optional eol + return $ odel <> inner <> cdel + where + del = B.str . toString <$> (arg <|> delim Open <|> delim Middle <|> delim Close) + +eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines +eliminateEmpty x y = if null y then mempty else x y + +cls :: T.Text -> B.Attr +cls x = (mempty, [x], mempty) + +-- mandoc -T html formats Sy with a tag, since it's not really +-- semantically , but Strong is our best option in Pandoc +parseSy :: PandocMonad m => MdocParser m Inlines +parseSy = simpleInline "Sy" (eliminateEmpty B.strong) + +parseEm :: PandocMonad m => MdocParser m Inlines +parseEm = simpleInline "Em" (eliminateEmpty B.emph) + +parseNo :: PandocMonad m => MdocParser m Inlines +parseNo = simpleInline "No" (eliminateEmpty id) + +-- Deprecated, mandoc doesn't style this at all +parseTn :: PandocMonad m => MdocParser m Inlines +parseTn = simpleInline "Tn" (eliminateEmpty id) + +parseLi :: PandocMonad m => MdocParser m Inlines +parseLi = codeLikeInline "Li" + +parseEv :: PandocMonad m => MdocParser m Inlines +parseEv = codeLikeInline "Ev" + +parseDv :: PandocMonad m => MdocParser m Inlines +parseDv = codeLikeInline "Dv" + +parseAd :: PandocMonad m => MdocParser m Inlines +parseAd = spanLikeInline "Ad" + +parseVa :: PandocMonad m => MdocParser m Inlines +parseVa = codeLikeInline' "Va" "variable" + +parseVt :: PandocMonad m => MdocParser m Inlines +parseVt = codeLikeInline' "Vt" "variable" + +parseAn :: PandocMonad m => MdocParser m Inlines +parseAn = try anSplit <|> anRegular + where + anSplit = do + macro "An" + mode <- literal "-split" $> True <|> literal "-nosplit" $> False + modifyState $ \s -> s{authorNameSplit = mode} + return mempty + anRegular = do + an <- spanLikeInline "An" + spl <- authorNameSplit <$> getState + return $ (if spl then B.linebreak else mempty) <> an + +parseMs :: PandocMonad m => MdocParser m Inlines +parseMs = spanLikeInline "Ms" + +-- TODO implement internal reference links +parseSx :: PandocMonad m => MdocParser m Inlines +parseSx = spanLikeInline "Sx" + +-- I'm not sure why mandoc inserts a ~ when Mt is missing an argument, +-- but it does, and it doesn't issue a warning, so that quirk is +-- retained. +parseMt :: PandocMonad m => MdocParser m Inlines +parseMt = simpleInline "Mt" mailto + where mailto x | null x = B.link ("mailto:~") "" "~" + | otherwise = B.link ("mailto:" <> stringify x) "" x + +parsePa :: PandocMonad m => MdocParser m Inlines +parsePa = simpleInline "Pa" p + where p x | null x = B.spanWith (cls "Pa") "~" + | otherwise = B.spanWith (cls "Pa") x + +-- There's a number of unique-looking cases for Fl parsing so I am just +-- handling them very explicitly instead of trying to generalize anything +-- enough to handle it. Could conceivably be better. +parseFl :: PandocMonad m => MdocParser m Inlines +parseFl = do + macro "Fl" + start <- option mempty (emptyWithDelim <|> flfl <|> emptyWithMacro <|> emptyEmpty) + segs <- manyTill segment inlineContextEnd + spacify ([start] <> segs) + where + emptyWithDelim = do + lookAhead $ many1 (delim Middle <|> delim Close) + ds <- closingDelimiters + return $ fl "-" <> ds + flfl = do + lookAhead (macro "Fl") + x:xs <- B.toList <$> parseFl + let xx = B.codeWith (cls "Fl") $ "-" <> stringify x + return $ xx <> B.fromList xs + emptyWithMacro = do + lookAhead anyMacro + rest <- parseInline + return $ fl "-" <> rest + emptyEmpty = lookAhead eol $> fl "-" + segment = do + (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText + inner <- (spacify . (map fl) . flags) inlines + return $ openDelim <> inner <> closeDelim + fl = B.codeWith (cls "Fl") + flags [] = ["-"] + flags xs = map ("-" <>) xs + +parseAr :: PandocMonad m => MdocParser m Inlines +parseAr = simpleInline "Ar" ar + where ar x | null x = B.codeWith (cls "variable") "file ..." + | otherwise = B.codeWith (cls "variable") $ stringify x + + +parseCm :: PandocMonad m => MdocParser m Inlines +parseCm = codeLikeInline "Cm" + +parseIc :: PandocMonad m => MdocParser m Inlines +parseIc = codeLikeInline "Ic" + +parseEr :: PandocMonad m => MdocParser m Inlines +parseEr = codeLikeInline "Er" + +parseCd :: PandocMonad m => MdocParser m Inlines +parseCd = codeLikeInline "Cd" + +parseQl :: PandocMonad m => MdocParser m Inlines +parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify + +parseDq :: PandocMonad m => MdocParser m Inlines +parseDq = lineEnclosure "Dq" B.doubleQuoted + +parseDo :: PandocMonad m => MdocParser m Inlines +parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted + +parseSq :: PandocMonad m => MdocParser m Inlines +parseSq = lineEnclosure "Sq" B.singleQuoted + +parseSo :: PandocMonad m => MdocParser m Inlines +parseSo = multilineEnclosure "So" "Sc" B.singleQuoted + +parseQq :: PandocMonad m => MdocParser m Inlines +parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\"" + +parseQo :: PandocMonad m => MdocParser m Inlines +parseQo = multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\"" + +parsePq :: PandocMonad m => MdocParser m Inlines +parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" + +parsePo :: PandocMonad m => MdocParser m Inlines +parsePo = multilineEnclosure "Po" "Pc" $ \x -> "(" <> x <> ")" + +parseBq :: PandocMonad m => MdocParser m Inlines +parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]" + +parseBo :: PandocMonad m => MdocParser m Inlines +parseBo = multilineEnclosure "Bo" "Bc" $ \x -> "[" <> x <> "]" + +-- For our purposes this probably behaves identically to Bq +-- in most circumstances but I might need to do something +-- special with it in SYNOPSIS +parseOp :: PandocMonad m => MdocParser m Inlines +parseOp = lineEnclosure "Op" $ \x -> "[" <> x <> "]" + +parseOo :: PandocMonad m => MdocParser m Inlines +parseOo = multilineEnclosure "Oo" "Oc" $ \x -> "[" <> x <> "]" + +parseBrq :: PandocMonad m => MdocParser m Inlines +parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}" + +parseBro :: PandocMonad m => MdocParser m Inlines +parseBro = multilineEnclosure "Bro" "Brc" $ \x -> "{" <> x <> "}" + +parseAq :: PandocMonad m => MdocParser m Inlines +parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩" + +parseAo :: PandocMonad m => MdocParser m Inlines +parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩" + +parseDl :: PandocMonad m => MdocParser m Blocks +parseDl = do + inner <- lineEnclosure "Dl" id + return $ B.codeBlock (stringify inner) + +parseD1 :: PandocMonad m => MdocParser m Blocks +parseD1 = do + inner <- lineEnclosure "D1" id + return $ B.divWith (cls "display") $ B.plain inner + +parseNm :: PandocMonad m => MdocParser m Inlines +parseNm = do + macro "Nm" + mnm <- (progName <$> getState) + (op, rg, cl) <- delimitedArgs $ option mempty litsToInlines + return $ case (mnm, rg) of + (Just nm, x) | null x -> + op <> ok nm <> cl + (_, x) -> + op <> (ok . stringify) x <> cl + where + ok = B.codeWith (cls "Nm") + + +parseXr :: PandocMonad m => MdocParser m Inlines +parseXr = do + macro "Xr" + (open, (name, section), close) <- delimitedArgs f + let ref = name <> "(" <> section <> ")" + return $ open <> B.spanWith (cls "Xr") (B.str ref) <> close + where + f = do + n <- lit "Xr manual name" + s <- lit "Xr manual section" + return (toString n, toString s) + +parseIn :: PandocMonad m => MdocParser m Inlines +parseIn = do + macro "In" + openClose <- closingDelimiters + openOpen <- openingDelimiters + header <- toString <$> lit + close <- closingDelimiters + return $ open openClose openOpen <> B.codeWith (cls "In") ("<" <> header <> ">") <> close + where + open a b + | null a = b + | null b = a + | otherwise = a <> B.space <> b + +parseFd :: PandocMonad m => MdocParser m Inlines +parseFd = codeLikeInline "Fd" + +parseFt :: PandocMonad m => MdocParser m Inlines +parseFt = codeLikeInline' "Ft" "variable" + +-- The output here is comparable to mandoc's HTML output, which doesn't tag +-- the commas/parentheses. Is this questionable from a pandoc POV? +formatFunction :: T.Text -> [Inlines] -> Inlines +formatFunction nm args = B.codeWith (cls "Fn") nm <> "(" <> args' <> ")" + where + args' = mconcat $ intersperse (", ") args + +parseFn :: PandocMonad m => MdocParser m Inlines +parseFn = do + macro "Fn" + (op, (nm, args), cl) <- delimitedArgs f + return $ op <> formatFunction nm (fmap (B.codeWith (cls "variable")) args) <> cl + where + f = do + nm <- toString <$> lit + args <- option [] litsToText + return (nm, args) + +parseFa :: PandocMonad m => MdocParser m Inlines +parseFa = codeLikeInline' "Fa" "variable" + +parseFo :: PandocMonad m => MdocParser m Inlines +parseFo = do + macro "Fo" + nm <- toString <$> lit + eol + args <- many (parseFa <* eol) + macro "Fc" + return $ formatFunction nm args + +parseLk :: PandocMonad m => MdocParser m Inlines +parseLk = do + macro "Lk" + openClose <- closingDelimiters + openOpen <- openingDelimiters + url <- toString <$> lit + inner <- many segment >>= spacify + close <- closingDelimiters + let label | null inner = B.str url + | otherwise = inner + return $ open openClose openOpen <> B.link url "" label <> close + where + open a b + | null a = b + | null b = a + | otherwise = a <> B.space <> b + end = msatisfy newControlContext + segment = do + a <- openingDelimiters + m <- option mempty litsToInlines + z <- + try (closingDelimiters <* notFollowedBy end) + <|> option mempty pipes + guard $ not $ all null [a, m, z] + return $ a <> m <> z + +-- This is a raw roff request but it appears sometimes in mdoc +-- manuals and is easy enough to handle +parsebr :: PandocMonad m => MdocParser m Inlines +parsebr = emptyMacro "br" >> return B.linebreak + +parseNs :: PandocMonad m => MdocParser m Inlines +parseNs = macro "Ns" >> return noSpace + +-- Per mdoc(7), Pf prefix macro [argument ...] is equivalent to +-- No \&prefix Ns macro [argument ...] and because of the way +-- spacify works, the easiest thing to do is just push an Ns onto +-- the input +parsePf :: PandocMonad m => MdocParser m Inlines +parsePf = do + macro "Pf" + t <- toString <$> anyToken + rest <- getInput + pos <- getPosition + setInput $ (Macro "Ns" pos):rest + return $ B.str t + +parseAp :: PandocMonad m => MdocParser m Inlines +parseAp = macro "Ap" >> return apMacro + +parseEx :: PandocMonad m => MdocParser m Inlines +parseEx = do + macro "Ex" + literal "-std" + args <- fmap toString <$> many lit + pn <- progName <$> getState + eol + return $ "The" + <> B.space + <> utils pn args + <> B.space + <> "0 on success, and >0 if an error occurs." + where + nm = B.codeWith (cls "Nm") + sing = "utility exits" + plur = "utilities exit" + utils (Just x) [] = nm x <> B.space <> sing + utils _ [x] = nm x <> B.space <> sing + utils _ [x,y] = nm x <> B.space <> "and" <> B.space <> nm y <> B.space <> plur + utils pn xs = + case (pn, unsnoc xs) of + (Nothing, Nothing) -> sing + (_, Just (hd, end)) -> mconcat ((intersperse (", ") . fmap nm) hd) <> ", and " <> nm end <> B.space <> plur + (Just p, Nothing) -> nm p <> B.space <> sing + + +parseRv :: (PandocMonad m) => MdocParser m Inlines +parseRv = do + macro "Rv" + literal "-std" + args <- fmap toString <$> many lit + pn <- progName <$> getState + eol + return $ go pn args + where + nm a = B.codeWith (cls "Fn") a <> "()" + nothing = "Upon successful completion, the value 0 is returned;" + sing = "function returns" + plur = "functions return" + success = "the value 0 if successful;" + errno = + "otherwise the value -1 is returned and the global variable" + <> B.codeWith (cls "variable") "errno" + <> "is set to indicate the error." + message conj = + "The" + <> B.space + <> conj + <> B.space + <> success + <> B.space + <> errno + go (Just x) [] = message (nm x <> B.space <> sing) + go _ [x] = message (nm x <> B.space <> sing) + go _ [x, y] = message (nm x <> B.space <> "and" <> B.space <> nm y <> B.space <> plur) + go pn xs = + case (pn, unsnoc xs) of + (Nothing, Nothing) -> nothing <> B.space <> errno + (_, Just (hd, end)) -> message (mconcat ((intersperse (", ") . fmap nm) hd) <> ", and " <> nm end <> B.space <> plur) + (Just p, Nothing) -> message (nm p <> B.space <> sing) + +parseSt :: PandocMonad m => MdocParser m Inlines +parseSt = do + macro "St" + (Lit std pos) <- lit + case standard std of + Nothing -> do + logMessage $ SkippedContent ("unrecognized argument to St: " <> std) pos + return mempty + Just t -> return $ B.text t + +-- TODO incorporate well-known library description and linker options +-- from mandoc lib.in expected in FreeBSD LIBRARY section, at minimum. +parseLb :: PandocMonad m => MdocParser m Inlines +parseLb = do + macro "Lb" + library <- toString <$> lit + return $ "library" <> B.space <> B.doubleQuoted (B.str library) + +unixVersion :: PandocMonad m => T.Text -> T.Text -> MdocParser m Inlines +unixVersion m s = do + macro m + (o, v, c) <- delimitedArgs (option mempty (toString <$> lit)) + return $ o <> B.str s <> f v <> c + where + f v | T.null v = mempty + | otherwise = B.space <> B.str v + +parseAt :: PandocMonad m => MdocParser m Inlines +parseAt = do + macro "At" + (o, v, c) <- delimitedArgs (optionMaybe (toString <$> lit)) + let v' = maybe "AT&T UNIX" attVer v + return $ o <> B.text v' <> c + where + isVersion x = x `elem` ["1", "2", "3", "4", "5", "6", "7"] + isRelease x = x `elem` ["1", "2", "3", "4"] + attVer (T.stripPrefix "v" -> Just ver) + | isVersion ver = "Version " <> ver <> " AT&T UNIX" + attVer "32v" = "Version 7 AT&T UNIX/32V" + attVer "III" = "AT&T System III UNIX" + attVer (T.stripPrefix "V." -> Just release) + | isRelease release = "AT&T System V Release " <> release <> " UNIX" + attVer "V" = "AT&T System V UNIX" + attVer x = "AT&T UNIX " <> x + +parseBsx :: PandocMonad m => MdocParser m Inlines +parseBsx = unixVersion "Bsx" "BSD/OS" + +parseBx :: PandocMonad m => MdocParser m Inlines +parseBx = do + macro "Bx" + (o, v, c) <- delimitedArgs zeroToTwoLits + return $ o <> bsd v <> c + where + zeroToTwoLits = do + toks <- try (count 2 lit) <|> count 1 lit <|> count 0 lit + return $ toString <$> toks + bsd [] = B.str "BSD" + bsd [x] = B.str $ x <> "BSD" + bsd (x:y:_) = B.str (x <> "BSD" <> "-" <> T.toTitle y) + +parseDx :: PandocMonad m => MdocParser m Inlines +parseDx = unixVersion "Dx" "DragonFly" + +parseFx :: PandocMonad m => MdocParser m Inlines +parseFx = unixVersion "Fx" "FreeBSD" + +-- This dance to capitalize a letter at the end of a NetBSD +-- version matches what mandoc does to the argument of .Nx. +-- See mandoc mdoc_validate.c r1.350 +-- Curiously, there's little easy-to-find evidence of what +-- these lettered releases actually are, other than +-- references in man page history sections to 0.9A etc. +parseNx :: PandocMonad m => MdocParser m Inlines +parseNx = do + macro "Nx" + (o, v, c) <- delimitedArgs (option mempty (toString <$> lit)) + return $ o <> "NetBSD" <> f v <> c + where + f v | T.null v = mempty + | otherwise = B.space <> B.str (fromRight v $ readWith earlyNetBSDVersion () v) + earlyNetBSDVersion = do + major <- oneOf "01" + dot <- char '.' + minor <- digit + ltr <- satisfy isAsciiLower + return $ T.pack [major, dot, minor, toUpper ltr] + +parseOx :: PandocMonad m => MdocParser m Inlines +parseOx = unixVersion "Ox" "OpenBSD" + +parseUx :: PandocMonad m => MdocParser m Inlines +parseUx = macro "Ux" >> return (B.str "UNIX") + +parseInlineMacro :: PandocMonad m => MdocParser m Inlines +parseInlineMacro = choice (synopsisTopicMacros <> otherInlineMacros) "inline macro" + +-- These macros always start a new line in SYNOPSIS +synopsisTopicMacros :: PandocMonad m => [MdocParser m Inlines] +synopsisTopicMacros = + [parseNm, parseCd, parseFd, parseFn, parseFo, parseIn, parseVt, parseFt] + +otherInlineMacros :: PandocMonad m => [MdocParser m Inlines] +otherInlineMacros = + [ parseSy, + parseEm, + parseLk, + parseLi, + parseEv, + parseDv, + parseMt, + parsePa, + parseFl, + parseCm, + parseIc, + parseEr, + parseAd, + parseVa, + parseAn, + parseMs, + parseSx, + parseAr, + parseFa, + parseNo, + parseTn, + parseXr, + parseQl, + parseOp, + parseSq, + parseDq, + parseQq, + parsePq, + parseBq, + parseBrq, + parseAq, + parseEo, + parseSo, + parseDo, + parseQo, + parsePo, + parseBo, + parseBro, + parseAo, + parseOo, + parseBf, + parseRsInline, + parseEx, + parseRv, + parseSt, + parseLb, + parseAt, + parseBsx, + parseBx, + parseDx, + parseFx, + parseNx, + parseOx, + parseUx, + parsebr, + parseAp, + parsePf, + parseNs, + skipUnsupportedInlines + ] + +parseInline :: PandocMonad m => MdocParser m Inlines +parseInline = parseStrs <|> (controlLine >>= spacify) "text lines or inline macros" + where + safeEol = do + amNested <- inLineEnclosure <$> getState + unless amNested $ optional eol + controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines) <* safeEol) + +parseInlines :: PandocMonad m => MdocParser m Inlines +parseInlines = many1 (parseSmToggle <|> parseInline) >>= spacify + +-- Lp is a deprecated synonym for Pp +parsePara :: PandocMonad m => MdocParser m Blocks +parsePara = B.para . B.trimInlines <$> parseInlines <* + optional (emptyMacro "Pp" <|> emptyMacro "Lp") + +-- Indented display blocks are visually similar to block quotes +-- but rarely carry those semantics. I'm just putting things in +-- divs. Centered is discouraged and rarely seen. +parseDisplay :: PandocMonad m => MdocParser m Blocks +parseDisplay = do + literal "-filled" <|> literal "-ragged" <|> literal "-centered" + many $ (literal "-offset" *> lit) <|> (literal "-compact") + eol + B.divWith (cls "display") . mconcat <$> many parseRegularBlock + +-- This is something of a best-effort interpretation of the -unfilled +-- display block type. The main difference with mandoc is probably +-- that newlines inside of multiline enclosures won't be preserved. +parseUnfilled :: PandocMonad m => MdocParser m Blocks +parseUnfilled = do + literal "-unfilled" + many $ (literal "-offset" *> lit) <|> (literal "-compact") + eol + lns <- many $ Just <$> parseStrPreserveSpace + <|> Nothing <$ parseSmToggle + <|> Just <$> parseInline + <|> Just "" <$ emptyMacro "Pp" + return $ B.lineBlock (catMaybes lns) + where + parseStrPreserveSpace = (B.str . toString) <$> str <|> (blank *> mempty) + +parseCodeBlock :: PandocMonad m => MdocParser m Blocks +parseCodeBlock = do + literal "-literal" + many $ (literal "-offset" *> lit) <|> (literal "-compact") + eol + lns <- many $ Just . toString <$> (str <|> blank) + <|> Nothing <$ parseSmToggle + <|> Just . stringify <$> parseInline + <|> Just "" <$ emptyMacro "Pp" + return $ B.codeBlock (T.unlines (catMaybes lns)) + +parseBd :: PandocMonad m => MdocParser m Blocks +parseBd = do + macro "Bd" + blk <- parseCodeBlock <|> parseDisplay <|> parseUnfilled + emptyMacro "Ed" + return blk + +-- This is a bit of a best effort version. Hypothetically multiple blocks +-- could occur inside a Bf and this should be a stateful thing but I don't +-- know if that's observed in the wild. +parseBf :: PandocMonad m => MdocParser m Inlines +parseBf = do + macro "Bf" + xform <- B.strong <$ (literal "Sy" <|> literal "-symbolic") + <|> B.emph <$ (literal "Em" <|> literal "-emphasis") + <|> code <$ (literal "Li" <|> literal "-literal") + eol + ins <- parseInlines + emptyMacro "Ef" + return $ xform ins + where + code = B.code . stringify + +skipListArgument :: (PandocMonad m) => MdocParser m () +skipListArgument = + void $ choice + [ literal "-width" *> lit, + literal "-offset" *> lit, + literal "-compact" + ] + +parseItemList :: PandocMonad m => MdocParser m Blocks +parseItemList = do + f <- (choice (map literal ["-bullet", "-dash", "-hyphen", "-item"]) $> B.bulletList) + <|> literal "-enum" $> B.orderedList + many skipListArgument + eol + items <- many bulletItem + return $ f items + where + bulletItem = do + emptyMacro "It" + mconcat <$> many parseRegularBlock + +-- Despite some ambiguous documentation to the contrary the Xo/Xc macros +-- only seem genuinely useful in an .It head, and it's not clear what if +-- anything it means to use them somewhere else in a contemporary mdoc manual. +-- See https://inbox.vuxu.org/mandoc-discuss/2UKLZW0DL8BSM.2IIO9W4HSUSRR@silby.fyi/T/ +-- for more blathering. +parseDefinitionList :: PandocMonad m => MdocParser m Blocks +parseDefinitionList = do + headParser <- (choice . map literal) ["-hang", "-inset", "-ohang", "-tag"] $> parsedHead <|> literal "-diag" $> diagHead + many skipListArgument + eol + items <- many (parseSmToggle *> mempty <|> dlItem headParser) + return $ B.definitionList items + where + parsedHead = try xoListHead <|> eolListHead + eolListHead = do + modifyState $ \s -> s{inLineEnclosure = True} + inner <- parseInlines + eol + modifyState $ \s -> s{inLineEnclosure = False} + return inner + diagHead = argsToInlines + dlItem hed = do + -- Some manuals have an evidently useless .Pp before .It + -- e.g. OpenBSD ld(1), just deal with it. + many ((void . emptyMacro) "Pp" <|> skipUnsupportedMacro "Tg") + macro "It" + dt <- hed + dd <- mconcat <$> many parseRegularBlock + return (dt, [dd]) + xoListHead = do + before <- option mempty parseInline + macro "Xo" + optional eol + after <- many1Till parseInlines (emptyMacro "Xc") + spacify (before:after) + +-- TODO support implicit rows: +-- If the first line of the body of a -column list is not an It macro line, +-- It contexts spanning one input line each are implied until an It macro +-- line is encountered +-- and support literal tabs +parseColumnList :: PandocMonad m => MdocParser m Blocks +parseColumnList = do + literal "-column" + many skipListArgument + many $ arg <|> delim Open <|> delim Middle <|> delim Close + eol + rows <- many listRow + return $ B.simpleTable [] rows + where + listRow = do + optional (emptyMacro "Pp") + macro "It" + fmap B.plain <$> sepBy (parseInlines <|> pure mempty) (macro "Ta" <* optional eol) + +parseBl :: PandocMonad m => MdocParser m Blocks +parseBl = do + macro "Bl" + blk <- parseItemList <|> parseDefinitionList <|> parseColumnList + emptyMacro "El" + return blk + +referenceField :: PandocMonad m => T.Text -> ReferenceField -> MdocParser m () +referenceField m field = do + macro m + reference <- currentReference <$> getState + contents <- stringify <$> litsAndDelimsToInlines + eol + modifyState $ \s -> s{currentReference = M.insertWith (++) field [contents] reference} + return () + +parsePercentA :: PandocMonad m => MdocParser m () +parsePercentA = referenceField "%A" Author + +parsePercentB :: PandocMonad m => MdocParser m () +parsePercentB = referenceField "%B" BookTitle + +parsePercentC :: PandocMonad m => MdocParser m () +parsePercentC = referenceField "%C" PubLocation + +parsePercentD :: PandocMonad m => MdocParser m () +parsePercentD = referenceField "%D" PubDate + +parsePercentI :: PandocMonad m => MdocParser m () +parsePercentI = referenceField "%I" Publisher + +parsePercentJ :: PandocMonad m => MdocParser m () +parsePercentJ = referenceField "%J" Journal + +parsePercentN :: PandocMonad m => MdocParser m () +parsePercentN = referenceField "%N" IssueNumber + +parsePercentO :: PandocMonad m => MdocParser m () +parsePercentO = referenceField "%O" Optional + +parsePercentP :: PandocMonad m => MdocParser m () +parsePercentP = referenceField "%P" Pages + +parsePercentQ :: PandocMonad m => MdocParser m () +parsePercentQ = referenceField "%Q" Institution + +parsePercentR :: PandocMonad m => MdocParser m () +parsePercentR = referenceField "%R" TechReportTitle + +parsePercentT :: PandocMonad m => MdocParser m () +parsePercentT = referenceField "%T" ArticleTitle + +parsePercentU :: PandocMonad m => MdocParser m () +parsePercentU = referenceField "%U" Url + +parsePercentV :: PandocMonad m => MdocParser m () +parsePercentV = referenceField "%V" VolumeNumber + +parseReferenceField :: PandocMonad m => MdocParser m () +parseReferenceField = + choice [ + parsePercentA, + parsePercentB, + parsePercentC, + parsePercentD, + parsePercentI, + parsePercentJ, + parsePercentN, + parsePercentO, + parsePercentP, + parsePercentQ, + parsePercentR, + parsePercentT, + parsePercentU, + parsePercentV + ] + +parseRsInline :: PandocMonad m => MdocParser m Inlines +parseRsInline = do + sec <- currentSection <$> getState + guard $ sec /= ShSeeAlso + parseRs + +parseRs :: PandocMonad m => MdocParser m Inlines +parseRs = do + emptyMacro "Rs" + modifyState $ \s -> s{currentReference = M.empty} + many1 parseReferenceField + emptyMacro "Re" + ref <- currentReference <$> getState + -- TODO formatting fields correctly + return $ B.text $ (M.foldl f mempty ref) <> "." + where join v = T.concat (intersperse ", " v) + f a v | T.null a = join v + | otherwise = a <> ", " <> join v + +-- mandoc's roff(7) says "Blank text lines, which may include whitespace, +-- are only permitted within literal contexts." mandoc -T lint warns about +-- blank lines and inserts a roff `sp` request, which is handled +-- differently depending on the output format. My read is that mandoc +-- considers the handling of a blank line in non-literal context in mdoc(7) +-- to be undefined. The Mdoc reader thus ignores blank input lines outside +-- of -literal and -unfilled displays. +skipBlanks :: PandocMonad m => MdocParser m Blocks +skipBlanks = many1 blank *> mempty + +-- By default, mdoc is in "spacing mode", where horizontal space is added +-- between macro contents. The Sm macro turns it off and on. When we encounter +-- the Sm macro, we both modify the parser state and we emit a sentinel value +-- that spacify/foldNoSpaces uses to handle cases where spacing mode gets +-- turned off and on within a stretch of inlines. +parseSmToggle :: PandocMonad m => MdocParser m Inlines +parseSmToggle = do + macro "Sm" + cur <- spacingMode <$> getState + mode <- optionMaybe (literal "on" $> True <|> literal "off" $> False) + eol + let newMode = update mode cur + modifyState $ \s -> s{spacingMode = newMode} + return $ if newMode then smOn else smOff + where + update = \case + Nothing -> not + Just x -> const x + +skipUnsupportedMacro :: PandocMonad m => T.Text -> MdocParser m () +skipUnsupportedMacro nm = do + (Macro _ pos) <- macro nm + manyTill anyToken eol + logMessage $ SkippedContent ("unsupported macro: " <> nm) pos + + +skipUnsupportedInlines :: PandocMonad m => MdocParser m Inlines +skipUnsupportedInlines = choice + [ skipUnsupportedMacro "Tg", + skipUnsupportedMacro "Bk", + skipUnsupportedMacro "Ek" + ] *> mempty + +skipUnknownMacro :: PandocMonad m => MdocParser m Blocks +skipUnknownMacro = do + pos <- getPosition + m <- anyMacro + manyTill anyToken eol + logMessage $ SkippedContent ("unsupported macro: " <> toString m) pos + return mempty + + +parseRegularBlock :: PandocMonad m => MdocParser m Blocks +parseRegularBlock = + choice + [ parseDl + , parseD1 + , parsePara + , emptyMacro "Pp" *> mempty + , parseBd + , parseBl + , skipBlanks + ] + +parseBlock :: (PandocMonad m) => MdocParser m Blocks +parseBlock = + choice + [ parseHeader + , parseNameSection + , parseSynopsisSection + , parseSeeAlsoSection + , parseMiniSynopsis + , parseRegularBlock + , skipUnknownMacro + ] diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs new file mode 100644 index 000000000000..d000b996c80f --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -0,0 +1,225 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeFamilies #-} +{- | + Module : Text.Pandoc.Readers.Mdoc.Lex + Copyright : Copyright (C) 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : Evan Silberman + Stability : WIP + Portability : portable + +Tokenizer for mdoc +-} +module Text.Pandoc.Readers.Mdoc.Lex + ( MdocToken(..) + , MdocTokens(..) + , DelimSide(..) + , lexMdoc + , toString + ) +where + +import Control.Monad (void, guard, when) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) +import Data.Char (isAlphaNum) +import Data.Maybe (isJust) +import qualified Data.Text as T +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Readers.Roff.Escape +import Text.Pandoc.Readers.Mdoc.Macros +import qualified Data.Sequence as Seq + +-- As a higher level language with a wealth of semantic macros, mdoc +-- discourages authors from falling back to low-level roff features like font +-- selection, custom macros, defined strings, etc. Pandoc's mdoc reader is +-- accordingly implemented as a high-level interpreter of mdoc's semantic macros +-- and almost no raw roff requests are supported. +-- +-- tbl(7) and eqn(7) macros are rare but not completely unseen in mdoc manuals. +-- they are not yet implemented. most use of tbl macros in mdoc could probably +-- be replaced with .Bl -column + +data DelimSide = Open | Middle | Close deriving (Show, Eq) + +-- | Tokens for Mdoc documents +data MdocToken = Str T.Text SourcePos -- ^ The contents of a text line + | Macro T.Text SourcePos -- ^ A macro to be processed + | Lit T.Text SourcePos -- ^ Literal text on a control line + | Blank SourcePos -- ^ A blank line + | Delim DelimSide T.Text SourcePos -- ^ A delimiter character + | Eol -- ^ The end of a control line + deriving Show + +toString :: MdocToken -> T.Text +toString (Str x _) = x +toString (Macro x _) = x +toString (Lit x _) = x +toString (Delim _ x _) = x +toString Blank{} = mempty +toString Eol = mempty + +newtype MdocTokens = MdocTokens { unMdocTokens :: Seq.Seq MdocToken } + deriving (Show, Semigroup, Monoid) + +singleTok :: MdocToken -> MdocTokens +singleTok t = MdocTokens (Seq.singleton t) + +type Lexer m = ParsecT Sources () m + +instance RoffLikeLexer MdocTokens where + -- This is a bit confusing. We're lexing to MdocTokens, but for escaping + -- purposes we just want Texts. + type Token MdocTokens = T.Text + -- We don't need a state + type State MdocTokens = () + -- We don't support predefined string expansion + expandString = return () + escString = return mempty + -- what token type the unescaped text gets wrapped in is decided by other + -- parts of the lexer. + emit = id + -- All escapes are resolved in the lexer and we never need to emit anything, + -- vs. the roff lexer which has to push the backlashes to the output while + -- in copy mode. + backslash = (mempty <* char '\\') <|> (mempty <* string "\\E") + -- We don't support macro definition and we don't output anything for \A + checkDefined = const mempty + -- We don't support copy mode and \E is treated as backslash + escE = return mempty + -- We don't support low-level font selection + escFont = escIgnore 'f' [escapeArg, countChar 1 (satisfy (/='\n'))] + +eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m MdocToken +eofline = do + void newline <|> eof + return Eol + +lexComment :: PandocMonad m => Lexer m MdocTokens +lexComment = do + try $ string ".\\\"" + skipMany $ noneOf "\n" + eofline + return mempty + +argText :: PandocMonad m => Lexer m T.Text +argText = do + beg <- escape <|> regularText + end <- mconcat <$> many (escape <|> regularText <|> quoteChar) + return $ beg <> end + +spaceTabChar :: PandocMonad m => Lexer m T.Text +spaceTabChar = T.singleton <$> spaceChar + +quotedArg :: PandocMonad m => Lexer m T.Text +quotedArg = do + quoteChar + t <- mconcat <$> many (try innerQuote <|> escape <|> regularText <|> spaceTabChar) + quoteChar + notFollowedBy quoteChar + return t + where + innerQuote = do + string "\"\"" + return "\"" + +anyText :: PandocMonad m => Lexer m T.Text +anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar + +regularText :: PandocMonad m => Lexer m T.Text +regularText = many1Char $ noneOf "\n\r\t \\\"" + +quoteChar :: PandocMonad m => Lexer m T.Text +quoteChar = T.singleton <$> char '"' + +mdocToken :: PandocMonad m => Lexer m MdocTokens +mdocToken = lexComment <|> lexControlLine <|> lexTextLine + +lexMacroName :: PandocMonad m => Lexer m T.Text +lexMacroName = many1Char (satisfy isMacroChar) + where + isMacroChar '%' = True + isMacroChar x = isAlphaNum x + +lexMacro :: PandocMonad m => Lexer m MdocToken +lexMacro = do + pos <- getPosition + name <- lexMacroName + eof <|> void (lookAhead (spaceChar <|> newline)) + skipSpaces + return $ Macro name pos + +lexCallableMacro :: PandocMonad m => Lexer m MdocToken +lexCallableMacro = do + pos <- getPosition + q <- optionMaybe quoteChar + name <- lexMacroName + when (isJust q) (void quoteChar) + eof <|> void (lookAhead (spaceChar <|> newline)) + skipSpaces + guard $ isCallableMacro name + return $ Macro name pos + +lexDelim :: (PandocMonad m) => Lexer m MdocToken +lexDelim = do + pos <- getPosition + q <- optionMaybe quoteChar + t <- + Delim Open <$> oneOfStrings ["(", "["] + <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + <|> Delim Middle <$> textStr "|" + when (isJust q) (void quoteChar) + eof <|> void (lookAhead (spaceChar <|> newline)) + skipSpaces + return $ t pos + +lexLit :: PandocMonad m => Lexer m MdocToken +lexLit = do + pos <- getPosition + t <- argText <|> quotedArg + skipSpaces + return $ Lit t pos + +lexTextLine :: PandocMonad m => Lexer m MdocTokens +lexTextLine = do + pos <- getPosition + guard $ sourceColumn pos == 1 + t <- mconcat <$> many anyText + eofline + if T.null $ T.strip t + then return $ singleTok $ Blank pos + else return $ singleTok $ Str t pos + +lexControlLine :: PandocMonad m => Lexer m MdocTokens +lexControlLine = do + pos <- getPosition + guard $ sourceColumn pos == 1 + char '.' + eofline *> mempty <|> do + m@(Macro name _) <- lexMacro + -- .Ns macros at the start of a line are ignored. We'd have to look behind + -- to keep track of the "start of the line" in the parser, so we'll drop + -- those macros in lexing. + let start | name == "Ns" = [] + | otherwise = [m] + let parsed = isParsedMacro name + (wds, e) <- manyUntil (l parsed) eofline + return $ MdocTokens $ Seq.fromList $ start <> wds <> [e] + where + l True = try lexDelim <|> try lexCallableMacro <|> lexLit + l False = try lexDelim <|> lexLit + +-- | Tokenize a string as a sequence of mdoc tokens. +lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens +lexMdoc pos txt = do + eithertokens <- readWithM (do setPosition pos + mconcat <$> manyTill mdocToken eof) def txt + case eithertokens of + Left e -> throwError e + Right tokenz -> return tokenz diff --git a/src/Text/Pandoc/Readers/Mdoc/Macros.hs b/src/Text/Pandoc/Readers/Mdoc/Macros.hs new file mode 100644 index 000000000000..798da0246b55 --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc/Macros.hs @@ -0,0 +1,183 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Mdoc.Macros + Copyright : + License : GNU GPL, version 2 or above + + Maintainer : Evan Silberman + Stability : WIP + Portability : portable + +-} +module Text.Pandoc.Readers.Mdoc.Macros (isParsedMacro, isCallableMacro) where + +import Data.Set (member, fromList, Set) +import Data.Text + +isParsedMacro :: Text -> Bool +isParsedMacro a = member a parsedMacros + +isCallableMacro :: Text -> Bool +isCallableMacro a = member a callableMacros + +parsedMacros :: Set Text +parsedMacros = fromList [ + "Ac", + "Ad", + "An", + "Ao", + "Ap", + "Aq", + "Ar", + "At", + "Bc", + "Bo", + "Bq", + "Brc", + "Bro", + "Brq", + "Bsx", + "Bx", + "Cd", + "Cm", + "D1", + "Dc", + "Dl", + "Do", + "Dq", + "Dv", + "Dx", + "Ec", + "Em", + "En", + "Eo", + "Er", + "Es", + "Ev", + "Fa", + "Fc", + "Fl", + "Fn", + "Fr", + "Ft", + "Fx", + "Ic", + "In", + "It", + "Li", + "Lk", + "Ms", + "Mt", + "Nm", + "No", + "Ns", + "Nx", + "Oc", + "Oo", + "Op", + "Ot", + "Ox", + "Pa", + "Pc", + "Pf", + "Po", + "Pq", + "Qc", + "Ql", + "Qo", + "Qq", + "Sc", + "Sh", + "So", + "Sq", + "Ss", + "St", + "Sx", + "Sy", + "Ta", + "Tn", + "Ux", + "Va", + "Vt", + "Xc", + "Xo", + "Xr"] + +callableMacros :: Set Text +callableMacros = fromList [ + "Ac", + "Ad", + "An", + "Ao", + "Ap", + "Aq", + "Ar", + "At", + "Bc", + "Bo", + "Bq", + "Brc", + "Bro", + "Brq", + "Bsx", + "Bx", + "Cd", + "Cm", + "Dc", + "Do", + "Dq", + "Dv", + "Dx", + "Ec", + "Em", + "En", + "Eo", + "Er", + "Es", + "Ev", + "Fa", + "Fc", + "Fl", + "Fn", + "Fo", + "Fr", + "Ft", + "Fx", + "Ic", + "In", + "Li", + "Lk", + "Ms", + "Mt", + "Nm", + "No", + "Ns", + "Nx", + "Oc", + "Oo", + "Op", + "Ot", + "Ox", + "Pa", + "Pc", + "Pf", + "Po", + "Pq", + "Qc", + "Ql", + "Qo", + "Qq", + "Sc", + "So", + "Sq", + "St", + "Sx", + "Sy", + "Ta", + "Tn", + "Ux", + "Va", + "Vt", + "Xc", + "Xo", + "Xr"] diff --git a/src/Text/Pandoc/Readers/Mdoc/Standards.hs b/src/Text/Pandoc/Readers/Mdoc/Standards.hs new file mode 100644 index 000000000000..475eaf763d77 --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc/Standards.hs @@ -0,0 +1,69 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Mdoc.Standards + Copyright : + License : GNU GPL, version 2 or above + + Maintainer : Evan Silberman + Stability : WIP + Portability : portable + +-} +module Text.Pandoc.Readers.Mdoc.Standards (standard) where + +import Data.Map (fromList, Map) +import qualified Data.Map as M +import Data.Text + +standard :: Text -> Maybe Text +standard = flip M.lookup standards + +standards :: Map Text Text +standards = fromList [ + ("-p1003.1-88", "IEEE Std 1003.1-1988 (“POSIX.1”)"), + ("-p1003.1-90", "IEEE Std 1003.1-1990 (“POSIX.1”)"), + ("-p1003.1-96", "ISO/IEC 9945-1:1996 (“POSIX.1”)"), + ("-p1003.1-2001", "IEEE Std 1003.1-2001 (“POSIX.1”)"), + ("-p1003.1-2004", "IEEE Std 1003.1-2004 (“POSIX.1”)"), + ("-p1003.1-2008", "IEEE Std 1003.1-2008 (“POSIX.1”)"), + ("-p1003.1-2024", "IEEE Std 1003.1-2024 (“POSIX.1”)"), + ("-p1003.1", "IEEE Std 1003.1 (“POSIX.1”)"), + ("-p1003.1b", "IEEE Std 1003.1b (“POSIX.1b”)"), + ("-p1003.1b-93", "IEEE Std 1003.1b-1993 (“POSIX.1b”)"), + ("-p1003.1c-95", "IEEE Std 1003.1c-1995 (“POSIX.1c”)"), + ("-p1003.1g-2000", "IEEE Std 1003.1g-2000 (“POSIX.1g”)"), + ("-p1003.1i-95", "IEEE Std 1003.1i-1995 (“POSIX.1i”)"), + ("-p1003.2", "IEEE Std 1003.2 (“POSIX.2”)"), + ("-p1003.2-92", "IEEE Std 1003.2-1992 (“POSIX.2”)"), + ("-p1003.2a-92", "IEEE Std 1003.2a-1992 (“POSIX.2”)"), + ("-isoC", "ISO/IEC 9899:1990 (“ISO C90”)"), + ("-isoC-90", "ISO/IEC 9899:1990 (“ISO C90”)"), + ("-isoC-amd1", "ISO/IEC 9899/AMD1:1995 (“ISO C90, Amendment 1”)"), + ("-isoC-tcor1", "ISO/IEC 9899/TCOR1:1994 (“ISO C90, Technical Corrigendum 1”)"), + ("-isoC-tcor2", "ISO/IEC 9899/TCOR2:1995 (“ISO C90, Technical Corrigendum 2”)"), + ("-isoC-99", "ISO/IEC 9899:1999 (“ISO C99”)"), + ("-isoC-2011", "ISO/IEC 9899:2011 (“ISO C11”)"), + ("-iso9945-1-90", "ISO/IEC 9945-1:1990 (“POSIX.1”)"), + ("-iso9945-1-96", "ISO/IEC 9945-1:1996 (“POSIX.1”)"), + ("-iso9945-2-93", "ISO/IEC 9945-2:1993 (“POSIX.2”)"), + ("-ansiC", "ANSI X3.159-1989 (“ANSI C89”)"), + ("-ansiC-89", "ANSI X3.159-1989 (“ANSI C89”)"), + ("-ieee754", "IEEE Std 754-1985"), + ("-iso8802-3", "ISO 8802-3: 1989"), + ("-iso8601", "ISO 8601"), + ("-ieee1275-94", "IEEE Std 1275-1994 (“Open Firmware”)"), + ("-xpg3", "X/Open Portability Guide Issue 3 (“XPG3”)"), + ("-xpg4", "X/Open Portability Guide Issue 4 (“XPG4”)"), + ("-xpg4.2", "X/Open Portability Guide Issue 4, Version 2 (“XPG4.2”)"), + ("-xbd5", "X/Open Base Definitions Issue 5 (“XBD5”)"), + ("-xcu5", "X/Open Commands and Utilities Issue 5 (“XCU5”)"), + ("-xsh5", "X/Open System Interfaces and Headers Issue 5 (“XSH5”)"), + ("-xns5", "X/Open Networking Services Issue 5 (“XNS5”)"), + ("-xns5.2", "X/Open Networking Services Issue 5.2 (“XNS5.2”)"), + ("-xcurses4.2", "X/Open Curses Issue 4, Version 2 (“XCURSES4.2”)"), + ("-susv1", "Version 1 of the Single UNIX Specification (“SUSv1”)"), + ("-susv2", "Version 2 of the Single UNIX Specification (“SUSv2”)"), + ("-susv3", "Version 3 of the Single UNIX Specification (“SUSv3”)"), + ("-susv4", "Version 4 of the Single UNIX Specification (“SUSv4”)"), + ("-svid4", "System V Interface Definition, Fourth Edition (“SVID4”)") + ] diff --git a/src/Text/Pandoc/Readers/Roff/Escape.hs b/src/Text/Pandoc/Readers/Roff/Escape.hs index f797ab165155..245acf1af6ea 100644 --- a/src/Text/Pandoc/Readers/Roff/Escape.hs +++ b/src/Text/Pandoc/Readers/Roff/Escape.hs @@ -5,6 +5,7 @@ module Text.Pandoc.Readers.Roff.Escape ( escape, escapeArg, + escIgnore, RoffLikeLexer(..), ) where @@ -50,8 +51,7 @@ class (Monoid (Token x)) => RoffLikeLexer x where backslash :: PandocMonad m => Lexer m x () -- | If the given custom macro is defined in this document, emit a -- tokenized "1", otherwise emit a tokenized "0", implementing the roff - -- escape @\A@. The default case assumes an implementation that doesn't - -- support macro definition and always emits null. + -- escape @\A@. checkDefined :: PandocMonad m => T.Text -> Lexer m x (Token x) -- | Output an appropriate token for the @\E@ escape sequence. In roff, @\E@ -- is an "escape character intended to not be interpreted in copy mode". diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs new file mode 100644 index 000000000000..32b8ae36e9db --- /dev/null +++ b/test/Tests/Readers/Mdoc.hs @@ -0,0 +1,321 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Mdoc + Copyright : © 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : + Stability : alpha + Portability : portable + +Tests for the Mdoc reader. +-} + +module Tests.Readers.Mdoc (tests) where + +import Data.Text (Text) +import qualified Data.Text as T +import Test.Tasty +import Test.Tasty.HUnit (HasCallStack) +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.Arbitrary () +import Text.Pandoc.Builder + +mdoc :: Text -> Pandoc +mdoc = purely $ readMdoc def + +infix 4 =: +(=:) :: (ToString c, HasCallStack) + => String -> (Text, c) -> TestTree +(=:) = test mdoc + +cls :: Text -> Attr +cls x = (mempty, [x], mempty) + +tests :: [TestTree] +tests = [ + testGroup "one-line enclosures" + [ "Dq" =: + ".Dq hello world" =?> + para (doubleQuoted "hello world") + , "Sq" =: + ".Sq hello world" =?> + para (singleQuoted "hello world") + , "empty" =: + ".Dq" =?> + para (doubleQuoted mempty) + , "nested" =: + ".Dq Pq hello world" =?> + para (doubleQuoted "(hello world)") + , "nested with closing delimiters" =: + ".Dq Pq hi mom !" =?> + para (doubleQuoted "(hi mom)" <> "!") + , "nested multiline enclosure" =: + ".Dq Po a \\&; b \\&; c Pc ." =?> + para (doubleQuoted "(a ; b ; c)" <> ".") + , "with inlines" =: + ".Dq hello Sy world ." =?> + para (doubleQuoted ("hello" <> space <> strong "world" <> ".")) + , "with text production" =: + ".Dq I love the St -iso8601 standard!" =?> + para (doubleQuoted "I love the ISO 8601 standard!") + , "with Ns" =: + ".Op , Ns Ar value ..." =?> + para ("[," <> (codeWith (cls "variable") "value ...") <> "]") + , "ending with open delimiter" =: + ".Dq hi (" =?> + para (doubleQuoted "hi (") + ] + , testGroup "multiline enclosures" + [ "nested multiline" =: + T.unlines [".Bo", ".Po", "hi", ".Pc", ".Bc"] =?> + para ("[(hi)]") + , "nested on one line" =: + ".Bo Po hi Pc Bc" =?> + para ("[(hi)]") + , "with wacky delimiters" =: + ".Bo ( | hi ! Bc ?" =?> + para ("([| hi!]?") + ] + , testGroup "simple inlines" + [ "Sy" =: + ".Sy hello world" =?> + para (strong "hello world") + , "Em" =: + ".Em hello world" =?> + para (emph "hello world") + , "Ev" =: + ".Ev HELLO_WORLD ," =?> + para (codeWith (cls "Ev") "HELLO_WORLD" <> ",") + , "Ar" =: + ".Ar ) z" =?> + para (codeWith (cls "variable") "file ..." <> ") " <> codeWith (cls "variable") "z") + , "In" =: + ".In ( math.h ) b c" =?> + para ("(" <> codeWith (cls "In") "" <> ") b c") + , "Mt" =: + ".Mt a@example.org , b@example.org" =?> + para ((link "mailto:a@example.org" "" "a@example.org") <> + "," <> space <> (link "mailto:b@example.org" "" "b@example.org")) + , "No" =: + ".No ( hello , world ! )" =?> + para "(hello, world!)" + , "empty Pa with closing punctuation" =: + ".Pa ) z" =?> + para (spanWith (cls "Pa") "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z") + , "delimiters" =: + ".Sy ( hello world )" =?> + para (mconcat ["(", strong "hello world", ")"]) + , "multiple" =: + ".Sy hello Em world" =?> + para (strong "hello" <> space <> emph "world") + ] + , testGroup "Fl" + [ "simple" =: + ".Fl w" =?> + para (codeWith (cls "Fl") "-w") + , "multiple" =: + ".Fl W all" =?> + para (codeWith (cls "Fl") "-W" <> space <> codeWith (cls "Fl") "-all") + , "empty with following macro" =: + ".Fl Cm x" =?> + para (codeWith (cls "Fl") "-" <> codeWith (cls "Cm") "x") + , "enclosed with following macro" =: + ".Pq Fl Cm x" =?> + para ("(" <> codeWith (cls "Fl") "-" <> codeWith (cls "Cm") "x" <> ")") + -- XXX this is a mandoc delta, the period is meant to land outside + -- the enclosure. parseInline has learned how to defer eols but not + -- closing delimiters + , "enclosed with following delimiters" =: + ".Pq Fl x ." =?> + para ("(" <> codeWith (cls "Fl") "-x" <> ".)") + , "following Ns" =: + ".Fl W Ns Cm all" =?> + para (codeWith (cls "Fl") "-W" <> codeWith (cls "Cm") "all") + , "GNU" =: + ".Fl -help" =?> + para (codeWith (cls "Fl") "--help") + , "GNU escaped" =: + ".Fl \\-help" =?> + para (codeWith (cls "Fl") "--help") + , "GNU Fl Fl" =: + ".Fl Fl help" =?> + para (codeWith (cls "Fl") "--help") + , "punctuation" =: + ".Op Fl a | b" =?> + para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]") + , "middle close paren" =: + ".Fl a ) z" =?> + para (codeWith (cls "Fl") "-a" <> ") " <> codeWith (cls "Fl") "-z") + , "empty with close paren" =: + ".Fl ) z" =?> + para (codeWith (cls "Fl") "-" <> ") " <> codeWith (cls "Fl") "-z") + , "empty with pipe" =: + ".Fl | z" =?> + para (codeWith (cls "Fl") "-" <> " | " <> codeWith (cls "Fl") "-z") + , "empty with parens" =: + ".Fl ( )" =?> + para ("(" <> codeWith (cls "Fl") "-" <> ")") + ] + , testGroup "links" + [ "basic" =: + ".Lk href name" =?> + para (link "href" "" "name") + , "complicated" =: + ".Lk , ( href name )" =?> + para ("," <> space <> "(" <> link "href" "" "name" <> ")") + , "unnamed" =: + ".Lk href" =?> + para (link "href" "" "href") + ] + , testGroup "Ns macro" + [ "at the beginning of a macro line (mandoc delta)" =: + T.unlines [".Op before", ".Ns Op after"] =?> + para "[before] [after]" -- mandoc: warning + , "after a block closing macro" =: + T.unlines [".Oo before", ".Oc Ns Op after"] =?> + para "[before][after]" + , "in the middle of a macro line" =: + ".Oo before Oc Ns Op after" =?> + para "[before][after]" + , "before closing punctuation" =: + ".Oo before Oc Ns : Op after" =?> + para "[before]: [after]" -- mandoc: warning + , "after closing punctuation" =: + ".Oo before Oc : Ns Op after" =?> + para "[before]:[after]" + , "at the end of a macro line" =: + T.unlines [".Oo before Oc Ns", ".Op after"] =?> + para "[before][after]" + , "at the end of a partial-implicit line" =: + T.unlines [".Op before Ns", ".Op after"] =?> + para "[before][after]" + , "normal words" =: + ".No no Ns ns No no" =?> + para ("nons" <> space <> "no") + , "opening punctuation" =: + ".No no Ns \"(\" ns No no" =?> + para ("no(ns" <> space <> "no") + , "closing punctuation" =: + ".No no \"Ns\" ns \")\" No no" =?> + para ("nons)" <> space <> "no") + ] + , testGroup "spacing mode" + [ "all text" =: + T.unlines ["a", ".Sm off", "b c", "d", ".Sm on", "e"] =?> + para ("a b c d e") + , "text around macro" =: + T.unlines ["a", ".Sm off", ".Sy b c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "bc" <> space <> "d") + , "mulitple macros" =: + T.unlines ["a", ".Sm off", ".Sy b Em c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> emph "c" <> space <> "d") + , "mulitple control lines" =: + T.unlines ["a", ".Sm off", ".Sy b", ".Em c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> emph "c" <> space <> "d") + , "mixed control and text lines" =: + T.unlines ["a", ".Sm off", ".Sy b", "c", ".Em d", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> "c" <> emph "d" <> space <> "d") + , "delimiters" =: + T.unlines [".Sm off", ".Em a ", ".Em [ b | c ]", ".Sm on"] =?> + para (emph "a" <> "[" <> emph "b" <> "|" <> emph "c" <> "]") + ] + , testGroup "Ap macro" + [ "in the middle of a macro line" =: + ".Xr mandoc 1 Ap s" =?> + para (spanWith (cls "Xr") "mandoc(1)" <> "'s") + -- mandoc difference: the edge case of "Ap (" tested in this mandoc regress + -- isn't present in any actual OpenBSD base system manuals, where Ap is + -- only ever followed by a letter. Furthermore, "Ap" is generally uncommon + -- compared to "Ns '" (e.g. ".Xr mandoc 1 Ns 's"). I'm accepting a + -- difference from mandoc here because correctly suppressing space after + -- the "(" here would require more refactoring than I feel like doing at + -- time of writing. + -- per mandoc, should be: para (strong "bold" <> "'(" <> strong "bold") + , "with punctuation and called macro" =: + ".Sy bold Ap ( \"Sy\" bold" =?> + para (strong "bold" <> "'( " <> strong "bold") + ] + , testGroup "Pf macro" + [ "closing punctuation" =: + ".Pf . right ." =?> + para ".right." + , "double punctuation" =: + ".Pf . . double" =?> + para ".. double" + , "opening punctuation" =: + ".Pf ( left ." =?> + para "(left." + , "unparsed argument" =: + ".Pf Ar Sy gument " =?> + para ("Ar" <> strong "gument") + ] + , testGroup "text production" + [ "early NetBSD versions" =: + T.unlines [".Nx 0.9a", "and", ".Nx 9.4b"] =?> + para "NetBSD 0.9A and NetBSD 9.4b" + , "with Ns" =: + ".Ox Ns -specific" =?> + para "OpenBSD-specific" + , "with punctuation" =: + ".Fx ," =?> + para "FreeBSD," + , "with argument and punctuation" =: + ".Fx 12.0 ." =?> + para "FreeBSD 12.0." + , "BSD alone" =: + ".Bx ." =?> + para "BSD." + , "BSD with macro" =: + ".Bx No rocks" =?> + para "BSD rocks" + , "BSD with version" =: + ".Bx 4.4 ," =?> + para "4.4BSD," + , "BSD with variant" =: + ".Bx 4.3 tahoe !" =?> + para "4.3BSD-Tahoe!" + ] + , testGroup "inline punctuation" + [ testGroup "leading punctuation" + [ "open paren" =: ".Em ( b" =?> para ("(" <> emph "b") + , "open square bracket" =: ".Em \"[\" b" =?> para ("[" <> emph "b") + , "pipe" =: ".Em | b" =?> para ("|" <> space <> emph "b") + , "period" =: ".Em . b" =?> para ("." <> space <> emph "b") + , "comma" =: ".Em , b" =?> para ("," <> space <> emph "b") + , "semicolon" =: ".Em ; b" =?> para (";" <> space <> emph "b") + , "colon" =: ".Em : b" =?> para (":" <> space <> emph "b") + , "question mark" =: ".Em ? b" =?> para ("?" <> space <> emph "b") + , "exclamation" =: ".Em ! b" =?> para ("!" <> space <> emph "b") + , "close paren" =: ".Em ) b" =?> para (")" <> space <> emph "b") + , "close square bracket" =: ".Em \"]\" b" =?> para ("]" <> space <> emph "b") + ] + , testGroup "trailing punctuation" + [ "open paren" =: ".Em a (" =?> para (emph "a" <> space <> "(") + , "open square bracket" =: ".Em a [" =?> para (emph "a" <> space <> "[") + , "pipe" =: ".Em a |" =?> para (emph "a" <> space <> "|") + , "period" =: ".Em a ." =?> para (emph "a" <> ".") + , "comma" =: ".Em a ," =?> para (emph "a" <> ",") + , "semicolon" =: ".Em a ;" =?> para (emph "a" <> ";") + , "colon" =: ".Em a :" =?> para (emph "a" <> ":") + , "question mark" =: ".Em a ?" =?> para (emph "a" <> "?") + , "exclamation" =: ".Em a !" =?> para (emph "a" <> "!") + , "close parens" =: ".Em a \")\"" =?> para (emph "a" <> ")") + , "close square bracket" =: ".Em a ]" =?> para (emph "a" <> "]") + ] + , testGroup "middle punctuation" + [ "open paren" =: ".Em a ( b" =?> para (mconcat [emph "a", space, "(", emph "b"]) + , "open square bracket" =: ".Em a [ b" =?> para (mconcat [emph "a", space, "[", emph "b"]) + , "pipe" =: ".Em a \"|\" b" =?> para (mconcat [emph "a", space, "|", space, emph "b"]) + , "period" =: ".Em a . b" =?> para (mconcat [emph "a", ".", space, emph "b"]) + , "comma" =: ".Em a , b" =?> para (mconcat [emph "a", ",", space, emph "b"]) + , "semicolon" =: ".Em a ; b" =?> para (mconcat [emph "a", ";", space, emph "b"]) + , "colon" =: ".Em a \":\" b" =?> para (mconcat [emph "a", ":", space, emph "b"]) + , "question mark" =: ".Em a ? b" =?> para (mconcat [emph "a", "?", space, emph "b"]) + , "exclamation" =: ".Em a ! b" =?> para (mconcat [emph "a", "!", space, emph "b"]) + , "close paren" =: ".Em a ) b" =?> para (mconcat [emph "a", ")", space, emph "b"]) + , "close square bracket" =: ".Em a ] b" =?> para (mconcat [emph "a", "]", space, emph "b"]) + ] + ] + ] diff --git a/test/command/mdoc-An.md b/test/command/mdoc-An.md new file mode 100644 index 000000000000..c219cdbd7300 --- /dev/null +++ b/test/command/mdoc-An.md @@ -0,0 +1,17 @@ +``` +% pandoc -f mdoc -t plain +.Sh DESCRIPTION +.An Emily Brontë , +.An Anne Brontë +.Pp +.An -split +.An Charlotte Brontë +.An Bramwell Brontë +^D +DESCRIPTION + +Emily Brontë, Anne Brontë + +Charlotte Brontë +Bramwell Brontë +``` diff --git a/test/command/mdoc-Bd-unfilled.md b/test/command/mdoc-Bd-unfilled.md new file mode 100644 index 000000000000..10a39ccc3e18 --- /dev/null +++ b/test/command/mdoc-Bd-unfilled.md @@ -0,0 +1,24 @@ +``` +% pandoc -f mdoc -t plain +.Bd -unfilled +’Twas brillig, and the slithy toves + Did gyre and gimble in the wabe: +All mimsy were the borogoves, + And the mome raths outgrabe. + +“Beware the Jabberwock, my son! + The jaws that bite, the claws that catch! +Beware the Jubjub bird, and shun + The frumious Bandersnatch!” +.Ed +^D +’Twas brillig, and the slithy toves + Did gyre and gimble in the wabe: +All mimsy were the borogoves, + And the mome raths outgrabe. + +“Beware the Jabberwock, my son! + The jaws that bite, the claws that catch! +Beware the Jubjub bird, and shun + The frumious Bandersnatch!” +``` diff --git a/test/command/mdoc-Bl-column.md b/test/command/mdoc-Bl-column.md new file mode 100644 index 000000000000..c460b03a4982 --- /dev/null +++ b/test/command/mdoc-Bl-column.md @@ -0,0 +1,22 @@ +``` +% pandoc -f mdoc -t djot +.Bl -column a b +.It a Ta +.Sy b +b +.Ta c +.Pp +.It c Ta d Ta \& +.It Em e +.Ta f +.Ta g +.It h Ta Ta j +.It Pq a Ta b Ta c +.El +^D +| a | *b* b | c | +| c | d | | +| _e_ | f | g | +| h | | j | +| (a) | b | c | +``` diff --git a/test/command/mdoc-Bl-tag.md b/test/command/mdoc-Bl-tag.md new file mode 100644 index 000000000000..5f5bd940f70c --- /dev/null +++ b/test/command/mdoc-Bl-tag.md @@ -0,0 +1,36 @@ +``` +% pandoc -f mdoc -t native +.Bl -tag +.It one +one +.It Xo +two +.Xc +two +.It Em three Xo +three +.Xc +three +.It Po one two +.Bq three +.Pc +four +.El +^D +[ DefinitionList + [ ( [ Str "one" ] , [ [ Para [ Str "one" ] ] ] ) + , ( [ Str "two" ] , [ [ Para [ Str "two" ] ] ] ) + , ( [ Emph [ Str "three" ] , Space , Str "three" ] + , [ [ Para [ Str "three" ] ] ] + ) + , ( [ Str "(one" + , Space + , Str "two" + , Space + , Str "[three])" + ] + , [ [ Para [ Str "four" ] ] ] + ) + ] +] +``` diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index d8c78276b3ec..6d749b84549f 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -29,6 +29,7 @@ import qualified Tests.Readers.RST import qualified Tests.Readers.RTF import qualified Tests.Readers.Txt2Tags import qualified Tests.Readers.Man +import qualified Tests.Readers.Mdoc import qualified Tests.Shared import qualified Tests.Writers.AsciiDoc import qualified Tests.Writers.ConTeXt @@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "Muse" Tests.Readers.Muse.tests , testGroup "Creole" Tests.Readers.Creole.tests , testGroup "Man" Tests.Readers.Man.tests + , testGroup "Mdoc" Tests.Readers.Mdoc.tests , testGroup "FB2" Tests.Readers.FB2.tests , testGroup "DokuWiki" Tests.Readers.DokuWiki.tests ]