Skip to content

Commit 3f1c7f0

Browse files
Emily Bourketarleb
Emily Bourke
authored andcommitted
Docx reader: Support new table features.
* Column spans * Row spans - The spec says that if the `val` attribute is ommitted, its value should be assumed to be `continue`, and that its values are restricted to {`restart`, `continue`}. If the value has any other value, I think it seems reasonable to default it to `continue`. It might cause problems if the spec is extended in the future by adding a third possible value, in which case this would probably give incorrect behaviour, and wouldn't error. * Allow multiple header rows * Include table description in simple caption - The table description element is like alt text for a table (along with the table caption element). It seems like we should include this somewhere, but I’m not 100% sure how – I’m pairing it with the simple caption for the moment. (Should it maybe go in the block caption instead?) * Detect table captions - Check for caption paragraph style /and/ either the simple or complex table field. This means the caption detection fails for captions which don’t contain a field, as in an example doc I added as a test. However, I think it’s better to be too conservative: a missed table caption will still show up as a paragraph next to the table, whereas if I incorrectly classify something else as a table caption it could cause havoc by pairing it up with a table it’s not at all related to, or dropping it entirely. * Update tests and add new ones Partially fixes: jgm#6316
1 parent d4e0e5f commit 3f1c7f0

15 files changed

+487
-67
lines changed

Diff for: src/Text/Pandoc/Readers/Docx.hs

+56-36
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx
6161
import Codec.Archive.Zip
6262
import Control.Monad.Reader
6363
import Control.Monad.State.Strict
64+
import Data.Bifunctor (bimap, first)
6465
import qualified Data.ByteString.Lazy as B
6566
import Data.Default (Default)
66-
import Data.List (delete, intersect)
67+
import Data.List (delete, intersect, foldl')
6768
import Data.Char (isSpace)
6869
import qualified Data.Map as M
6970
import qualified Data.Text as T
70-
import Data.Maybe (isJust, fromMaybe)
71+
import Data.Maybe (catMaybes, isJust, fromMaybe)
7172
import Data.Sequence (ViewL (..), viewl)
7273
import qualified Data.Sequence as Seq
7374
import qualified Data.Set as Set
@@ -113,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text
113114
-- restarting
114115
, docxListState :: M.Map (T.Text, T.Text) Integer
115116
, docxPrevPara :: Inlines
117+
, docxTableCaptions :: [Blocks]
116118
}
117119

118120
instance Default DState where
@@ -123,6 +125,7 @@ instance Default DState where
123125
, docxDropCap = mempty
124126
, docxListState = M.empty
125127
, docxPrevPara = mempty
128+
, docxTableCaptions = []
126129
}
127130

128131
data DEnv = DEnv { docxOptions :: ReaderOptions
@@ -491,15 +494,32 @@ singleParaToPlain blks
491494
singleton $ Plain ils
492495
singleParaToPlain blks = blks
493496

494-
cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks
495-
cellToBlocks (Docx.Cell bps) = do
497+
cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell
498+
cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do
496499
blks <- smushBlocks <$> mapM bodyPartToBlocks bps
497-
return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
500+
let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
501+
return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks')
502+
503+
rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row]
504+
rowsToRows rows = do
505+
let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows)
506+
cells <- traverse (traverse (uncurry cellToCell)) rowspans
507+
return (fmap (Pandoc.Row nullAttr) cells)
508+
509+
splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row])
510+
splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst
511+
$ if hasFirstRowFormatting
512+
then foldl' f ((take 1 rs, []), True) (drop 1 rs)
513+
else foldl' f (([], []), False) rs
514+
where
515+
f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs)
516+
| h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs)
517+
= ((r : headerRows, bodyRows), True)
518+
| otherwise
519+
= ((headerRows, r : bodyRows), False)
520+
521+
isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue
498522

499-
rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks]
500-
rowToBlocksList (Docx.Row cells) = do
501-
blksList <- mapM cellToBlocks cells
502-
return $ map singleParaToPlain blksList
503523

504524
-- like trimInlines, but also take out linebreaks
505525
trimSps :: Inlines -> Inlines
@@ -546,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName
546566
where go c | isSpace c = '-'
547567
| otherwise = c
548568

569+
bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks)
570+
bodyPartToTableCaption (TblCaption pPr parparts) =
571+
Just <$> bodyPartToBlocks (Paragraph pPr parparts)
572+
bodyPartToTableCaption _ = pure Nothing
573+
549574
bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
550575
bodyPartToBlocks (Paragraph pPr parparts)
551576
| Just True <- pBidi pPr = do
@@ -637,50 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
637662
let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr}
638663
in
639664
bodyPartToBlocks $ Paragraph pPr' parparts
665+
bodyPartToBlocks (TblCaption _ _) =
666+
return $ para mempty -- collected separately
640667
bodyPartToBlocks (Tbl _ _ _ []) =
641668
return $ para mempty
642-
bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do
643-
let cap' = simpleCaption $ plain $ text cap
644-
(hdr, rows) = case firstRowFormatting look of
645-
True | null rs -> (Nothing, [r])
646-
| otherwise -> (Just r, rs)
647-
False -> (Nothing, r:rs)
648-
649-
cells <- mapM rowToBlocksList rows
669+
bodyPartToBlocks (Tbl cap grid look parts) = do
670+
captions <- gets docxTableCaptions
671+
fullCaption <- case captions of
672+
c : cs -> do
673+
modify (\s -> s { docxTableCaptions = cs })
674+
return (Just c)
675+
[] -> return Nothing
676+
let shortCaption = if T.null cap then Nothing else Just (toList (text cap))
677+
cap' = caption shortCaption (fromMaybe mempty fullCaption)
678+
(hdr, rows) = splitHeaderRows (firstRowFormatting look) parts
650679

651680
let width = maybe 0 maximum $ nonEmpty $ map rowLength parts
652681
rowLength :: Docx.Row -> Int
653-
rowLength (Docx.Row c) = length c
682+
rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c)
654683

655-
let toRow = Pandoc.Row nullAttr . map simpleCell
656-
toHeaderRow l = [toRow l | not (null l)]
684+
headerCells <- rowsToRows hdr
685+
bodyCells <- rowsToRows rows
657686

658-
-- pad cells. New Text.Pandoc.Builder will do that for us,
659-
-- so this is for compatibility while we switch over.
660-
let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells
661-
662-
hdrCells <- case hdr of
663-
Just r' -> toHeaderRow <$> rowToBlocksList r'
664-
Nothing -> return []
665-
666-
-- The two following variables (horizontal column alignment and
667-
-- relative column widths) go to the default at the
668-
-- moment. Width information is in the TblGrid field of the Tbl,
669-
-- so should be possible. Alignment might be more difficult,
670-
-- since there doesn't seem to be a column entity in docx.
687+
-- Horizontal column alignment goes to the default at the moment. Getting
688+
-- it might be difficult, since there doesn't seem to be a column entity
689+
-- in docx.
671690
let alignments = replicate width AlignDefault
672691
totalWidth = sum grid
673692
widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid
674693

675694
return $ table cap'
676695
(zip alignments widths)
677-
(TableHead nullAttr hdrCells)
678-
[TableBody nullAttr 0 [] cells']
696+
(TableHead nullAttr headerCells)
697+
[TableBody nullAttr 0 [] bodyCells]
679698
(TableFoot nullAttr [])
680699
bodyPartToBlocks (OMathPara e) =
681700
return $ para $ displayMath (writeTeX e)
682701

683-
684702
-- replace targets with generated anchors.
685703
rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
686704
rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do
@@ -716,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
716734
bodyToOutput (Body bps) = do
717735
let (metabps, blkbps) = sepBodyParts bps
718736
meta <- bodyPartsToMeta metabps
737+
captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps
738+
modify (\s -> s { docxTableCaptions = captions })
719739
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
720740
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
721741
blks'' <- removeOrphanAnchors blks'

Diff for: src/Text/Pandoc/Readers/Docx/Parse.hs

+100-12
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
3333
, ParStyle
3434
, CharStyle(cStyleData)
3535
, Row(..)
36+
, TblHeader(..)
3637
, Cell(..)
38+
, VMerge(..)
3739
, TrackedChange(..)
3840
, ChangeType(..)
3941
, ChangeInfo(..)
@@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
5052
, pHeading
5153
, constructBogusParStyleData
5254
, leftBiasedMergeRunStyle
55+
, rowsToRowspans
5356
) where
5457
import Text.Pandoc.Readers.Docx.Parse.Styles
5558
import Codec.Archive.Zip
@@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = []
225228
data BodyPart = Paragraph ParagraphStyle [ParPart]
226229
| ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart]
227230
| Tbl T.Text TblGrid TblLook [Row]
231+
| TblCaption ParagraphStyle [ParPart]
228232
| OMathPara [Exp]
229233
deriving Show
230234

@@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool}
236240
defaultTblLook :: TblLook
237241
defaultTblLook = TblLook{firstRowFormatting = False}
238242

239-
newtype Row = Row [Cell]
240-
deriving Show
243+
data Row = Row TblHeader [Cell] deriving Show
244+
245+
data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq)
241246

242-
newtype Cell = Cell [BodyPart]
247+
data Cell = Cell GridSpan VMerge [BodyPart]
243248
deriving Show
244249

250+
type GridSpan = Integer
251+
252+
data VMerge = Continue
253+
-- ^ This cell should be merged with the one above it
254+
| Restart
255+
-- ^ This cell should not be merged with the one above it
256+
deriving (Show, Eq)
257+
258+
rowsToRowspans :: [Row] -> [[(Int, Cell)]]
259+
rowsToRowspans rows = let
260+
removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart))
261+
in removeMergedCells (foldr f [] rows)
262+
where
263+
f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]]
264+
f (Row _ cells) acc = let
265+
spans = g cells Nothing (listToMaybe acc)
266+
in spans : acc
267+
268+
g ::
269+
-- | The current row
270+
[Cell] ->
271+
-- | Number of columns left below
272+
Maybe Integer ->
273+
-- | (rowspan so far, cell) for the row below this one
274+
Maybe [(Int, Cell)] ->
275+
-- | (rowspan so far, cell) for this row
276+
[(Int, Cell)]
277+
g cells _ Nothing = zip (repeat 1) cells
278+
g cells columnsLeftBelow (Just rowBelow) =
279+
case cells of
280+
[] -> []
281+
thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of
282+
[] -> zip (repeat 1) cells
283+
(spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ ->
284+
let spanSoFar = case vmerge of
285+
Restart -> 1
286+
Continue -> 1 + spanSoFarBelow
287+
columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow)
288+
(newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow
289+
in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow)
290+
291+
dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)])
292+
dropColumns n [] = (n, [])
293+
dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) =
294+
if n < gridSpan
295+
then (gridSpan - n, cells)
296+
else dropColumns (n - gridSpan) otherCells
297+
245298
leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle
246299
leftBiasedMergeRunStyle a b = RunStyle
247300
{ isBold = isBold a <|> isBold b
@@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element =
587640
do
588641
let cellElems = findChildrenByName ns "w" "tc" element
589642
cells <- mapD (elemToCell ns) cellElems
590-
return $ Row cells
643+
let hasTblHeader = maybe NoTblHeader (const HasTblHeader)
644+
(findChildByName ns "w" "trPr" element
645+
>>= findChildByName ns "w" "tblHeader")
646+
return $ Row hasTblHeader cells
591647
elemToRow _ _ = throwError WrongElem
592648

593649
elemToCell :: NameSpaces -> Element -> D Cell
594650
elemToCell ns element | isElem ns "w" "tc" element =
595651
do
652+
let properties = findChildByName ns "w" "tcPr" element
653+
let gridSpan = properties
654+
>>= findChildByName ns "w" "gridSpan"
655+
>>= findAttrByName ns "w" "val"
656+
>>= stringToInteger
657+
let vMerge = case properties >>= findChildByName ns "w" "vMerge" of
658+
Nothing -> Restart
659+
Just e ->
660+
fromMaybe Continue $ do
661+
s <- findAttrByName ns "w" "val" e
662+
case s of
663+
"continue" -> Just Continue
664+
"restart" -> Just Restart
665+
_ -> Nothing
596666
cellContents <- mapD (elemToBodyPart ns) (elChildren element)
597-
return $ Cell cellContents
667+
return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents
598668
elemToCell _ _ = throwError WrongElem
599669

600670
elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
@@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle
626696
elemToBodyPart :: NameSpaces -> Element -> D BodyPart
627697
elemToBodyPart ns element
628698
| isElem ns "w" "p" element
629-
, (c:_) <- findChildrenByName ns "m" "oMathPara" element =
630-
do
631-
expsLst <- eitherToD $ readOMML $ showElement c
632-
return $ OMathPara expsLst
699+
, (c:_) <- findChildrenByName ns "m" "oMathPara" element = do
700+
expsLst <- eitherToD $ readOMML $ showElement c
701+
return $ OMathPara expsLst
633702
elemToBodyPart ns element
634703
| isElem ns "w" "p" element
635704
, Just (numId, lvl) <- getNumInfo ns element = do
@@ -647,13 +716,32 @@ elemToBodyPart ns element
647716
Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
648717
levelInfo <- lookupLevel numId lvl <$> asks envNumbering
649718
return $ ListItem parstyle numId lvl levelInfo parparts
650-
_ -> return $ Paragraph parstyle parparts
719+
_ -> let
720+
hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
721+
722+
hasSimpleTableField = fromMaybe False $ do
723+
fldSimple <- findChildByName ns "w" "fldSimple" element
724+
instr <- findAttrByName ns "w" "instr" fldSimple
725+
pure ("Table" `elem` T.words instr)
726+
727+
hasComplexTableField = fromMaybe False $ do
728+
instrText <- findElementByName ns "w" "instrText" element
729+
pure ("Table" `elem` T.words (strContent instrText))
730+
731+
in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
732+
then return $ TblCaption parstyle parparts
733+
else return $ Paragraph parstyle parparts
734+
651735
elemToBodyPart ns element
652736
| isElem ns "w" "tbl" element = do
653-
let caption' = findChildByName ns "w" "tblPr" element
737+
let tblProperties = findChildByName ns "w" "tblPr" element
738+
caption' = tblProperties
654739
>>= findChildByName ns "w" "tblCaption"
655740
>>= findAttrByName ns "w" "val"
656741
caption = fromMaybe "" caption'
742+
description = fromMaybe "" $ tblProperties
743+
>>= findChildByName ns "w" "tblDescription"
744+
>>= findAttrByName ns "w" "val"
657745
grid' = case findChildByName ns "w" "tblGrid" element of
658746
Just g -> elemToTblGrid ns g
659747
Nothing -> return []
@@ -666,7 +754,7 @@ elemToBodyPart ns element
666754
grid <- grid'
667755
tblLook <- tblLook'
668756
rows <- mapD (elemToRow ns) (elChildren element)
669-
return $ Tbl caption grid tblLook rows
757+
return $ Tbl (caption <> description) grid tblLook rows
670758
elemToBodyPart _ _ = throwError WrongElem
671759

672760
lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target

Diff for: src/Text/Pandoc/Readers/Docx/Util.hs

+7
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util (
1919
, elemToNameSpaces
2020
, findChildByName
2121
, findChildrenByName
22+
, findElementByName
2223
, findAttrByName
2324
) where
2425

@@ -56,6 +57,12 @@ findChildrenByName ns pref name el =
5657
let ns' = ns <> elemToNameSpaces el
5758
in findChildren (elemName ns' pref name) el
5859

60+
-- | Like 'findChildrenByName', but searches descendants.
61+
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
62+
findElementByName ns pref name el =
63+
let ns' = ns <> elemToNameSpaces el
64+
in findElement (elemName ns' pref name) el
65+
5966
findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
6067
findAttrByName ns pref name el =
6168
let ns' = ns <> elemToNameSpaces el

Diff for: test/Tests/Readers/Docx.hs

+16
Original file line numberDiff line numberDiff line change
@@ -317,14 +317,30 @@ tests = [ testGroup "document"
317317
"tables with lists in cells"
318318
"docx/table_with_list_cell.docx"
319319
"docx/table_with_list_cell.native"
320+
, testCompare
321+
"a table with a header which contains rowspans greater than 1"
322+
"docx/table_header_rowspan.docx"
323+
"docx/table_header_rowspan.native"
320324
, testCompare
321325
"tables with one row"
322326
"docx/table_one_row.docx"
323327
"docx/table_one_row.native"
328+
, testCompare
329+
"tables with just one row, which is a header"
330+
"docx/table_one_header_row.docx"
331+
"docx/table_one_header_row.native"
324332
, testCompare
325333
"tables with variable width"
326334
"docx/table_variable_width.docx"
327335
"docx/table_variable_width.native"
336+
, testCompare
337+
"tables with captions which contain a Table field"
338+
"docx/table_captions_with_field.docx"
339+
"docx/table_captions_with_field.native"
340+
, testCompare
341+
"tables with captions which don't contain a Table field"
342+
"docx/table_captions_no_field.docx"
343+
"docx/table_captions_no_field.native"
328344
, testCompare
329345
"code block"
330346
"docx/codeblock.docx"

0 commit comments

Comments
 (0)