diff --git a/utils/grass_html2md.sh b/utils/grass_html2md.sh index 9ac2e548ae1..af51d8b37ee 100755 --- a/utils/grass_html2md.sh +++ b/utils/grass_html2md.sh @@ -9,14 +9,14 @@ set -eu # wget # # Author(s): -# Martin Landa, Markus Neteler +# Martin Landa, Markus Neteler, Corey White # # Usage: # If you have "pandoc" in PATH, execute for HTML file conversion in # current directory and subdirectories: # ./utils/grass_html2md.sh # -# COPYRIGHT: (C) 2024 by the GRASS Development Team +# COPYRIGHT: (C) 2024-2025 by the GRASS Development Team # # This program is free software under the GNU General Public # License (>=v2). Read the file COPYING that comes with GRASS @@ -43,6 +43,22 @@ trap "exitprocedure" 2 3 15 # path to LUA file (./utils/pandoc_codeblock.lua) UTILSPATH="utils" +process_file() { + local file="$1" # temporary file + local f="$2" # original file + + cat "$file" | \ + sed 's#
#
#g' | \
+        sed 's#
##g' | \ + pandoc -f html-native_divs \ + -t gfm+pipe_tables+gfm_auto_identifiers --wrap=auto \ + --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" | \ + sed 's+ \\\$+ \$+g' | sed 's+%20+-+g' > "${f%%.html}.md" + + rm -f "$file" + +} + # run recursively: HTML to MD for f in $(find . -name *.html); do echo "${f}" @@ -57,13 +73,6 @@ for f in $(find . -name *.html); do s|_KEEPHTML||g; ' "${f%%.html}.html" > "${f%%.html}_tmp.html" - cat "${f%%.html}_tmp.html" | \ - sed 's#
#
#g' | \
-        sed 's#
##g' | \ - pandoc --from=html --to=markdown -t gfm \ - --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" | \ - sed 's+ \\\$+ \$+g' | sed 's+%20+-+g' > "${f%%.html}.md" - - rm -f "${f%%.html}_tmp.html" + process_file "${f%%.html}_tmp.html" ${f%%.html}.html done diff --git a/utils/pandoc_codeblock.lua b/utils/pandoc_codeblock.lua index e2a0a54910f..4e45faa6147 100644 --- a/utils/pandoc_codeblock.lua +++ b/utils/pandoc_codeblock.lua @@ -2,7 +2,71 @@ -- Test cases -- raster/r.sun/r.sun.html --- Function to convert code blocks to markdown -function CodeBlock (cb) - return pandoc.RawBlock('markdown', '```shell\n' .. cb.text .. '\n```\n') +-- Enforces markdownlint rules during Pandoc conversion +local MAX_LINE_LENGTH = 120 -- Adjust as needed for MD013 + +local LIST_INDENT = "" + +function Image(el) + -- Convert HTML to Markdown ![alt text](src) + local alt_text = el.alt or "image-alt" + local src = el.src + return pandoc.Image({pandoc.Str(alt_text)}, src) +end + +-- Fixes some edge cases with raw HTML elements +function RawInline(el) + if el.format == "html" then + if el.text:match("") then + return pandoc.RawInline("markdown", "*") + elseif el.text:match("") then + return pandoc.RawInline("markdown", "*") + elseif el.text:match("") then + return pandoc.RawInline("markdown", "*") + elseif el.text:match("") then + return pandoc.RawInline("markdown", "*") + elseif el.text:match(" ") then + return pandoc.RawInline("markdown", " ") + elseif el.text:match("<") then + return pandoc.RawInline("markdown", "<") + elseif el.text:match(">") then + return pandoc.RawInline("markdown", ">") + end + end + return el +end + +function CodeBlock(el) + -- Ensure fenced code blocks with backticks + local lang = el.classes[1] or "sh" -- Preserve language if available + return pandoc.RawBlock("markdown", "```" .. lang .. "\n" .. el.text .. "\n```") +end + +function Header(el) + return pandoc.Header(el.level, el.content) -- Ensure ATX-style headers +end + +function Str(el) + local text = el.text:gsub("%s+$", "") -- Remove trailing spaces + return pandoc.Str(text) +end + +function Pandoc(doc) + -- Process document with defined rules + local new_blocks = {} + local previous_blank = false + + for _, block in ipairs(doc.blocks) do + if block.t == "Para" and #block.content == 0 then + if not previous_blank then + table.insert(new_blocks, block) + end + previous_blank = true + else + table.insert(new_blocks, block) + previous_blank = false + end + end + + return pandoc.Pandoc(new_blocks) end