Skip to content

Commit

Permalink
Docx reader: Support new table features.
Browse files Browse the repository at this point in the history
* Column spans
* Row spans
  - The spec says that if the `val` attribute is ommitted, its value
    should be assumed to be `continue`, and that its values are
    restricted to {`restart`, `continue`}. If the value has any other
    value, I think it seems reasonable to default it to `continue`. It
    might cause problems if the spec is extended in the future by adding
    a third possible value, in which case this would probably give
    incorrect behaviour, and wouldn't error.
* Allow multiple header rows
* Include table description in simple caption
  - The table description element is like alt text for a table (along
    with the table caption element). It seems like we should include
    this somewhere, but I’m not 100% sure how – I’m pairing it with the
    simple caption for the moment. (Should it maybe go in the block
    caption instead?)
* Detect table captions
  - Check for caption paragraph style /and/ either the simple or
    complex table field. This means the caption detection fails for
    captions which don’t contain a field, as in an example doc I added
    as a test. However, I think it’s better to be too conservative: a
    missed table caption will still show up as a paragraph next to the
    table, whereas if I incorrectly classify something else as a table
    caption it could cause havoc by pairing it up with a table it’s
    not at all related to, or dropping it entirely.
* Update tests and add new ones

Partially fixes: jgm#6316
  • Loading branch information
Emily Bourke authored and tarleb committed May 28, 2021
1 parent d4e0e5f commit 0cab1cb
Show file tree
Hide file tree
Showing 15 changed files with 487 additions and 68 deletions.
92 changes: 56 additions & 36 deletions src/Text/Pandoc/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx
import Codec.Archive.Zip
import Control.Monad.Reader
import Control.Monad.State.Strict
import Data.Bifunctor (bimap, first)
import qualified Data.ByteString.Lazy as B
import Data.Default (Default)
import Data.List (delete, intersect)
import Data.List (delete, intersect, foldl')
import Data.Char (isSpace)
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Maybe (isJust, fromMaybe)
import Data.Maybe (catMaybes, isJust, fromMaybe)
import Data.Sequence (ViewL (..), viewl)
import qualified Data.Sequence as Seq
import qualified Data.Set as Set
Expand Down Expand Up @@ -113,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text
-- restarting
, docxListState :: M.Map (T.Text, T.Text) Integer
, docxPrevPara :: Inlines
, docxTableCaptions :: [Blocks]
}

instance Default DState where
Expand All @@ -123,6 +125,7 @@ instance Default DState where
, docxDropCap = mempty
, docxListState = M.empty
, docxPrevPara = mempty
, docxTableCaptions = []
}

data DEnv = DEnv { docxOptions :: ReaderOptions
Expand Down Expand Up @@ -491,15 +494,32 @@ singleParaToPlain blks
singleton $ Plain ils
singleParaToPlain blks = blks

cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks
cellToBlocks (Docx.Cell bps) = do
cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell
cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do
blks <- smushBlocks <$> mapM bodyPartToBlocks bps
return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks')

rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row]
rowsToRows rows = do
let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows)
cells <- traverse (traverse (uncurry cellToCell)) rowspans
return (fmap (Pandoc.Row nullAttr) cells)

splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row])
splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst
$ if hasFirstRowFormatting
then foldl' f ((take 1 rs, []), True) (drop 1 rs)
else foldl' f (([], []), False) rs
where
f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs)
| h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs)
= ((r : headerRows, bodyRows), True)
| otherwise
= ((headerRows, r : bodyRows), False)

isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue

rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks]
rowToBlocksList (Docx.Row cells) = do
blksList <- mapM cellToBlocks cells
return $ map singleParaToPlain blksList

-- like trimInlines, but also take out linebreaks
trimSps :: Inlines -> Inlines
Expand Down Expand Up @@ -546,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName
where go c | isSpace c = '-'
| otherwise = c

bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks)
bodyPartToTableCaption (TblCaption pPr parparts) =
Just <$> bodyPartToBlocks (Paragraph pPr parparts)
bodyPartToTableCaption _ = pure Nothing

bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
| Just True <- pBidi pPr = do
Expand Down Expand Up @@ -637,50 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr}
in
bodyPartToBlocks $ Paragraph pPr' parparts
bodyPartToBlocks (TblCaption _ _) =
return $ para mempty -- collected separately
bodyPartToBlocks (Tbl _ _ _ []) =
return $ para mempty
bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do
let cap' = simpleCaption $ plain $ text cap
(hdr, rows) = case firstRowFormatting look of
True | null rs -> (Nothing, [r])
| otherwise -> (Just r, rs)
False -> (Nothing, r:rs)

cells <- mapM rowToBlocksList rows
bodyPartToBlocks (Tbl cap grid look parts) = do
captions <- gets docxTableCaptions
fullCaption <- case captions of
c : cs -> do
modify (\s -> s { docxTableCaptions = cs })
return c
[] -> return $ if T.null cap then mempty else plain (text cap)
let shortCaption = if T.null cap then Nothing else Just (toList (text cap))
cap' = caption shortCaption fullCaption
(hdr, rows) = splitHeaderRows (firstRowFormatting look) parts

let width = maybe 0 maximum $ nonEmpty $ map rowLength parts
rowLength :: Docx.Row -> Int
rowLength (Docx.Row c) = length c
rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c)

let toRow = Pandoc.Row nullAttr . map simpleCell
toHeaderRow l = [toRow l | not (null l)]
headerCells <- rowsToRows hdr
bodyCells <- rowsToRows rows

-- pad cells. New Text.Pandoc.Builder will do that for us,
-- so this is for compatibility while we switch over.
let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells

hdrCells <- case hdr of
Just r' -> toHeaderRow <$> rowToBlocksList r'
Nothing -> return []

-- The two following variables (horizontal column alignment and
-- relative column widths) go to the default at the
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
-- Horizontal column alignment goes to the default at the moment. Getting
-- it might be difficult, since there doesn't seem to be a column entity
-- in docx.
let alignments = replicate width AlignDefault
totalWidth = sum grid
widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid

return $ table cap'
(zip alignments widths)
(TableHead nullAttr hdrCells)
[TableBody nullAttr 0 [] cells']
(TableHead nullAttr headerCells)
[TableBody nullAttr 0 [] bodyCells]
(TableFoot nullAttr [])
bodyPartToBlocks (OMathPara e) =
return $ para $ displayMath (writeTeX e)


-- replace targets with generated anchors.
rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do
Expand Down Expand Up @@ -716,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps
modify (\s -> s { docxTableCaptions = captions })
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
blks'' <- removeOrphanAnchors blks'
Expand Down
113 changes: 100 additions & 13 deletions src/Text/Pandoc/Readers/Docx/Parse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, ParStyle
, CharStyle(cStyleData)
, Row(..)
, TblHeader(..)
, Cell(..)
, VMerge(..)
, TrackedChange(..)
, ChangeType(..)
, ChangeInfo(..)
Expand All @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, pHeading
, constructBogusParStyleData
, leftBiasedMergeRunStyle
, rowsToRowspans
) where
import Text.Pandoc.Readers.Docx.Parse.Styles
import Codec.Archive.Zip
Expand Down Expand Up @@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = []
data BodyPart = Paragraph ParagraphStyle [ParPart]
| ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart]
| Tbl T.Text TblGrid TblLook [Row]
| TblCaption ParagraphStyle [ParPart]
| OMathPara [Exp]
deriving Show

Expand All @@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool}
defaultTblLook :: TblLook
defaultTblLook = TblLook{firstRowFormatting = False}

newtype Row = Row [Cell]
deriving Show
data Row = Row TblHeader [Cell] deriving Show

data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq)

newtype Cell = Cell [BodyPart]
data Cell = Cell GridSpan VMerge [BodyPart]
deriving Show

type GridSpan = Integer

data VMerge = Continue
-- ^ This cell should be merged with the one above it
| Restart
-- ^ This cell should not be merged with the one above it
deriving (Show, Eq)

rowsToRowspans :: [Row] -> [[(Int, Cell)]]
rowsToRowspans rows = let
removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart))
in removeMergedCells (foldr f [] rows)
where
f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]]
f (Row _ cells) acc = let
spans = g cells Nothing (listToMaybe acc)
in spans : acc

g ::
-- | The current row
[Cell] ->
-- | Number of columns left below
Maybe Integer ->
-- | (rowspan so far, cell) for the row below this one
Maybe [(Int, Cell)] ->
-- | (rowspan so far, cell) for this row
[(Int, Cell)]
g cells _ Nothing = zip (repeat 1) cells
g cells columnsLeftBelow (Just rowBelow) =
case cells of
[] -> []
thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of
[] -> zip (repeat 1) cells
(spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ ->
let spanSoFar = case vmerge of
Restart -> 1
Continue -> 1 + spanSoFarBelow
columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow)
(newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow
in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow)

dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)])
dropColumns n [] = (n, [])
dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) =
if n < gridSpan
then (gridSpan - n, cells)
else dropColumns (n - gridSpan) otherCells

leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle
leftBiasedMergeRunStyle a b = RunStyle
{ isBold = isBold a <|> isBold b
Expand Down Expand Up @@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element =
do
let cellElems = findChildrenByName ns "w" "tc" element
cells <- mapD (elemToCell ns) cellElems
return $ Row cells
let hasTblHeader = maybe NoTblHeader (const HasTblHeader)
(findChildByName ns "w" "trPr" element
>>= findChildByName ns "w" "tblHeader")
return $ Row hasTblHeader cells
elemToRow _ _ = throwError WrongElem

elemToCell :: NameSpaces -> Element -> D Cell
elemToCell ns element | isElem ns "w" "tc" element =
do
let properties = findChildByName ns "w" "tcPr" element
let gridSpan = properties
>>= findChildByName ns "w" "gridSpan"
>>= findAttrByName ns "w" "val"
>>= stringToInteger
let vMerge = case properties >>= findChildByName ns "w" "vMerge" of
Nothing -> Restart
Just e ->
fromMaybe Continue $ do
s <- findAttrByName ns "w" "val" e
case s of
"continue" -> Just Continue
"restart" -> Just Restart
_ -> Nothing
cellContents <- mapD (elemToBodyPart ns) (elChildren element)
return $ Cell cellContents
return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents
elemToCell _ _ = throwError WrongElem

elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
Expand Down Expand Up @@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle
elemToBodyPart :: NameSpaces -> Element -> D BodyPart
elemToBodyPart ns element
| isElem ns "w" "p" element
, (c:_) <- findChildrenByName ns "m" "oMathPara" element =
do
expsLst <- eitherToD $ readOMML $ showElement c
return $ OMathPara expsLst
, (c:_) <- findChildrenByName ns "m" "oMathPara" element = do
expsLst <- eitherToD $ readOMML $ showElement c
return $ OMathPara expsLst
elemToBodyPart ns element
| isElem ns "w" "p" element
, Just (numId, lvl) <- getNumInfo ns element = do
Expand All @@ -647,13 +716,31 @@ elemToBodyPart ns element
Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
levelInfo <- lookupLevel numId lvl <$> asks envNumbering
return $ ListItem parstyle numId lvl levelInfo parparts
_ -> return $ Paragraph parstyle parparts
_ -> let
hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)

hasSimpleTableField = fromMaybe False $ do
fldSimple <- findChildByName ns "w" "fldSimple" element
instr <- findAttrByName ns "w" "instr" fldSimple
pure ("Table" `elem` T.words instr)

hasComplexTableField = fromMaybe False $ do
instrText <- findElementByName ns "w" "instrText" element
pure ("Table" `elem` T.words (strContent instrText))

in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
then return $ TblCaption parstyle parparts
else return $ Paragraph parstyle parparts

elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
let caption' = findChildByName ns "w" "tblPr" element
let tblProperties = findChildByName ns "w" "tblPr" element
caption = fromMaybe "" $ tblProperties
>>= findChildByName ns "w" "tblCaption"
>>= findAttrByName ns "w" "val"
caption = fromMaybe "" caption'
description = fromMaybe "" $ tblProperties
>>= findChildByName ns "w" "tblDescription"
>>= findAttrByName ns "w" "val"
grid' = case findChildByName ns "w" "tblGrid" element of
Just g -> elemToTblGrid ns g
Nothing -> return []
Expand All @@ -666,7 +753,7 @@ elemToBodyPart ns element
grid <- grid'
tblLook <- tblLook'
rows <- mapD (elemToRow ns) (elChildren element)
return $ Tbl caption grid tblLook rows
return $ Tbl (caption <> description) grid tblLook rows
elemToBodyPart _ _ = throwError WrongElem

lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target
Expand Down
7 changes: 7 additions & 0 deletions src/Text/Pandoc/Readers/Docx/Util.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util (
, elemToNameSpaces
, findChildByName
, findChildrenByName
, findElementByName
, findAttrByName
) where

Expand Down Expand Up @@ -56,6 +57,12 @@ findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el

-- | Like 'findChildrenByName', but searches descendants.
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findElementByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findElement (elemName ns' pref name) el

findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
Expand Down
16 changes: 16 additions & 0 deletions test/Tests/Readers/Docx.hs
Original file line number Diff line number Diff line change
Expand Up @@ -317,14 +317,30 @@ tests = [ testGroup "document"
"tables with lists in cells"
"docx/table_with_list_cell.docx"
"docx/table_with_list_cell.native"
, testCompare
"a table with a header which contains rowspans greater than 1"
"docx/table_header_rowspan.docx"
"docx/table_header_rowspan.native"
, testCompare
"tables with one row"
"docx/table_one_row.docx"
"docx/table_one_row.native"
, testCompare
"tables with just one row, which is a header"
"docx/table_one_header_row.docx"
"docx/table_one_header_row.native"
, testCompare
"tables with variable width"
"docx/table_variable_width.docx"
"docx/table_variable_width.native"
, testCompare
"tables with captions which contain a Table field"
"docx/table_captions_with_field.docx"
"docx/table_captions_with_field.native"
, testCompare
"tables with captions which don't contain a Table field"
"docx/table_captions_no_field.docx"
"docx/table_captions_no_field.native"
, testCompare
"code block"
"docx/codeblock.docx"
Expand Down
Loading

0 comments on commit 0cab1cb

Please sign in to comment.