
Conversation

@sylvestre (Contributor)

It requires many changes to make it more robust.

@sylvestre (Contributor Author)

The script to convert md => fluent:

#!/usr/bin/env python3
"""
Convert markdown documentation files to fluent format.
"""

import argparse
import re
import sys
from pathlib import Path


def extract_command_name(filepath):
    """Extract command name from file path or content."""
    # Try to get from filename first
    filename = Path(filepath).stem
    if filename and filename != "README":
        return filename

    # Fallback to parsing the markdown header
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()
        match = re.search(r"^#\s+(\w+)", content, re.MULTILINE)
        if match:
            return match.group(1)

    return None


def escape_fluent_special_chars(text):
    """Escape special characters that might cause issues in Fluent files."""
    if not text:
        return text

    # Handle curly braces which have special meaning in Fluent
    text = text.replace('{', '\\{').replace('}', '\\}')

    return text


def clean_markdown_for_fluent(text):
    """Clean markdown formatting to make it Fluent-compatible."""
    if not text:
        return text

    lines = text.split('\n')
    cleaned_lines = []
    in_code_block = False

    for line in lines:

        # Handle code block fences: skip the fence line itself (with or
        # without a language tag) so only the block's content is kept
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
            continue

        if in_code_block:
            # Inside code blocks, preserve formatting
            cleaned_lines.append(line)
            continue

        # Keep markdown headers as-is (###, ####, etc.)
        # Don't convert them - just preserve them

        # Convert markdown bullet points to simple format
        if re.match(r'^\s*\*\s+', line):
            # Extract the content after the bullet point
            content = re.sub(r'^\s*\*\s+', '', line)
            # Remove inline code backticks but preserve content
            content = re.sub(r'`([^`]*)`', r'\1', content)
            # Get the original indentation level
            indent_level = len(line) - len(line.lstrip())
            # Convert to simple dash format
            line = ' ' * indent_level + '- ' + content

        else:
            # Remove inline code backticks but preserve content for non-bullet lines
            line = re.sub(r'`([^`]*)`', r'\1', line)

        # Clean up excessive backslash escaping (common in printf docs)
        # But be careful not to break intentional escaping
        line = re.sub(r'\\\\\\\\', r'\\\\', line)  # Convert \\\\ to \\

        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


def preserve_formatting(text):
    """Preserve formatting while cleaning up excessive whitespace."""
    if not text:
        return ""

    # First clean markdown syntax that causes Fluent issues
    text = clean_markdown_for_fluent(text)

    # Split into lines to preserve line structure
    lines = text.split('\n')
    processed_lines = []

    for line in lines:
        # Preserve leading whitespace (indentation) but clean up excessive spaces within the line
        leading_spaces = len(line) - len(line.lstrip())
        content = line.strip()

        if content:
            # Clean up multiple spaces within content but preserve single spaces
            content = re.sub(r' +', ' ', content)
            # Escape special Fluent characters
            content = escape_fluent_special_chars(content)
            # Reconstruct line with original indentation
            processed_lines.append(' ' * leading_spaces + content)
        else:
            # Preserve empty lines
            processed_lines.append('')

    return '\n'.join(processed_lines)


def parse_markdown_file(filepath):
    """Parse a markdown file and extract relevant sections."""
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract command name
    command_name = extract_command_name(filepath)
    if not command_name:
        return None

    # Extract usage (code blocks)
    usage_patterns = []
    code_blocks = re.findall(r"```\n?(.*?)\n?```", content, re.DOTALL)
    for block in code_blocks:
        # Clean up the usage lines while preserving structure
        lines = [line.rstrip() for line in block.split("\n")]
        # Remove empty lines at start and end, but preserve internal empty lines
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()

        if lines:
            usage_patterns.extend(lines)
            break

    # Extract description (text immediately after usage block, before ## sections)
    description = ""
    # Look for text between the first ``` block and the first ## header
    usage_end_match = re.search(r"```.*?```\s*\n", content, re.DOTALL)
    if usage_end_match:
        remaining_content = content[usage_end_match.end():]
        # Get text until first ## header
        desc_match = re.search(r"^(.*?)(?=\n##|\Z)", remaining_content, re.DOTALL)
        if desc_match:
            desc_text = desc_match.group(1).strip()
            if desc_text:
                description = preserve_formatting(desc_text)

    # Extract after-help section (everything after "## After Help")
    after_help = ""
    after_help_match = re.search(
        r"##\s*After [Hh]elp\s*\n(.*?)(?=\Z)", content, re.MULTILINE | re.DOTALL
    )
    if after_help_match:
        desc_text = after_help_match.group(1).strip()
        if desc_text:
            after_help = preserve_formatting(desc_text)

    return {
        "command": command_name,
        "usage": usage_patterns,
        "description": description,
        "after_help": after_help,
    }


def format_fluent_multiline(text):
    """Format text for Fluent multi-line values."""
    if not text or '\n' not in text:
        return text

    lines = text.split('\n')
    formatted_lines = []

    for i, line in enumerate(lines):
        if i == 0:
            # First line stays as-is
            formatted_lines.append(line)
        else:
            # All subsequent lines need at least 2 spaces of indentation for Fluent
            if line.strip():
                # Preserve existing indentation but ensure minimum 2 spaces
                existing_indent = len(line) - len(line.lstrip())
                min_indent = max(2, existing_indent + 2)
                formatted_lines.append(' ' * min_indent + line.lstrip())
            else:
                # Empty lines
                formatted_lines.append('')

    return '\n'.join(formatted_lines)


def convert_to_fluent(parsed_data):
    """Convert parsed data to fluent format."""
    if parsed_data is None:
        return ""

    command = parsed_data["command"]
    lines = []

    # Add description (about)
    if parsed_data["description"]:
        formatted_desc = format_fluent_multiline(parsed_data["description"])
        lines.append(f"{command}-about = {formatted_desc}")

    # Add usage
    if parsed_data["usage"]:
        # Join multiple usage patterns
        usage_text = '\n'.join(parsed_data["usage"])
        formatted_usage = format_fluent_multiline(usage_text)
        lines.append(f"{command}-usage = {formatted_usage}")

    # Add after-help if present
    if parsed_data["after_help"]:
        formatted_help = format_fluent_multiline(parsed_data["after_help"])
        lines.append(f"{command}-after-help = {formatted_help}")

    return "\n".join(lines)


def get_output_path(input_file):
    """Generate the output .ftl file path based on input markdown file path."""
    input_path = Path(input_file)

    # Extract the program directory (e.g., src/uu/ls/ls.md -> ls)
    if len(input_path.parts) >= 3 and input_path.parts[-3] == "uu":
        program_name = input_path.parts[-2]  # Get the program directory name
        base_dir = input_path.parent  # src/uu/PROGRAM/
        output_dir = base_dir / "locales"
        output_file = output_dir / "en-US.ftl"
        return output_file

    # Fallback: just change extension to .ftl in same directory
    return input_path.with_suffix(".ftl")


def process_directory(input_dir, output_file=None):
    """Process all markdown files in a directory."""
    input_path = Path(input_dir)
    processed_count = 0

    # Find all markdown files
    md_files = list(input_path.rglob("*.md"))

    for md_file in sorted(md_files):
        print(f"Processing: {md_file}")
        try:
            parsed = parse_markdown_file(md_file)
            if parsed:
                fluent_text = convert_to_fluent(parsed)
                if fluent_text:
                    # Generate output path
                    if output_file:
                        output_path = Path(output_file)
                    else:
                        output_path = get_output_path(md_file)

                    # Create output directory if it doesn't exist
                    output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Write the fluent file
                    with open(output_path, "w", encoding="utf-8") as f:
                        f.write(fluent_text)

                    print(f"  ✓ Converted {parsed['command']} -> {output_path}")
                    processed_count += 1
                else:
                    print(f"  ✗ No content extracted from {md_file}")
            else:
                print(f"  ✗ Failed to parse {md_file}")
        except Exception as e:
            print(f"  ✗ Error processing {md_file}: {e}")

    print(f"\nProcessed {processed_count} files successfully.")
    return processed_count


def validate_fluent_content(content):
    """Validate that the generated content doesn't have obvious Fluent syntax issues."""
    lines = content.split('\n')
    issues = []

    for i, line in enumerate(lines, 1):
        # Check for unescaped curly braces
        if '{' in line and '\\{' not in line:
            issues.append(f"Line {i}: Unescaped curly brace might cause parsing issues")

        # Check for proper indentation in multi-line values
        if line.startswith(' ') and len(line.strip()) > 0:
            if not line.startswith('  '):  # Fluent needs at least 2 spaces
                issues.append(f"Line {i}: Insufficient indentation (needs at least 2 spaces)")

        # Check for markdown bullet points that weren't converted
        if re.match(r'^\s*\*\s+`', line):
            issues.append(f"Line {i}: Markdown bullet point with backticks may cause parsing issues")

        # Check for problematic characters at start of lines (after indentation)
        stripped = line.lstrip()
        if stripped and not re.match(r'^[a-zA-Z0-9\-_:.\s]', stripped):
            issues.append(f"Line {i}: Line starts with potentially problematic character: '{stripped[0]}'")

    return issues


def debug_fluent_output(content, max_lines=10):
    """Debug helper to show problematic lines in generated content."""
    lines = content.split('\n')
    print(f"\nFirst {max_lines} lines of generated content:")
    print("-" * 50)

    for i, line in enumerate(lines[:max_lines], 1):
        # Show special characters visibly
        display_line = line.replace('\t', '→').replace(' ', '·')
        print(f"{i:3}: '{display_line}'")

        # Check for potential issues
        if line.strip().startswith('*'):
            print("     ^ WARNING: Line starts with asterisk")
        if '`' in line:
            print("     ^ WARNING: Contains backticks")

    if len(lines) > max_lines:
        print(f"... ({len(lines) - max_lines} more lines)")
    print("-" * 50)


def debug_parsing_steps(filepath, parsed_data):
    """Debug helper to show what was extracted during parsing."""
    print(f"\n--- Debug: Parsing steps for {filepath} ---")

    if parsed_data:
        print(f"Command: {parsed_data['command']}")

        print(f"\nUsage ({len(parsed_data['usage'])} lines):")
        for i, usage in enumerate(parsed_data['usage']):
            print(f"  {i+1}: '{usage}'")

        print(f"\nDescription ({len(parsed_data['description'])} chars):")
        if parsed_data['description']:
            # Show first 100 chars
            desc_preview = parsed_data['description'][:100]
            if len(parsed_data['description']) > 100:
                desc_preview += "..."
            print(f"  '{desc_preview}'")
        else:
            print("  (empty)")

        print(f"\nAfter-help ({len(parsed_data['after_help'])} chars):")
        if parsed_data['after_help']:
            # Show first 200 chars and last 100 chars
            help_text = parsed_data['after_help']
            if len(help_text) > 300:
                preview = help_text[:200] + "\n  ...\n  " + help_text[-100:]
            else:
                preview = help_text
            print(f"  '{preview}'")
        else:
            print("  (empty)")
    else:
        print("  Failed to parse!")

    print("--- End debug ---\n")


def process_single_file(input_file, output_file=None):
    """Process a single markdown file."""
    print(f"Processing: {input_file}")

    try:
        parsed = parse_markdown_file(input_file)
        if parsed:
            print(f"  - Command: {parsed['command']}")
            print(f"  - Usage patterns: {len(parsed['usage'])}")
            print(f"  - Description length: {len(parsed['description']) if parsed['description'] else 0}")
            print(f"  - After-help length: {len(parsed['after_help']) if parsed['after_help'] else 0}")

            # Add debugging
            debug_parsing_steps(input_file, parsed)

            fluent_text = convert_to_fluent(parsed)
            if fluent_text:
                # Validate the generated content
                issues = validate_fluent_content(fluent_text)
                if issues:
                    print("  ⚠️  Potential issues found:")
                    for issue in issues:
                        print(f"     {issue}")

                # Debug output for troubleshooting
                debug_fluent_output(fluent_text)

                # Generate output path
                if output_file:
                    output_path = Path(output_file)
                else:
                    output_path = get_output_path(input_file)

                # Create output directory if it doesn't exist
                output_path.parent.mkdir(parents=True, exist_ok=True)

                # Write the fluent file
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(fluent_text)

                print(f"✓ Output written to: {output_path}")
                return fluent_text
            else:
                print("No content could be extracted.")
        else:
            print("Failed to parse the file.")
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()

    return None


def main():
    parser = argparse.ArgumentParser(
        description="Convert markdown docs to fluent format"
    )
    parser.add_argument("input", help="Input markdown file or directory")
    parser.add_argument(
        "-o",
        "--output",
        help="Output file (only used for single file processing, ignored for directory processing)",
    )

    args = parser.parse_args()

    input_path = Path(args.input)

    if input_path.is_file():
        process_single_file(args.input, args.output)
    elif input_path.is_dir():
        if args.output:
            print(
                "Warning: -o/--output option ignored when processing directories. Files will be generated in their respective locales/ directories."
            )
        process_directory(args.input)
    else:
        print(f"Error: {args.input} is not a valid file or directory")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
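
Running it on a single util, e.g. ./md_to_fluent.py src/uu/ls/ls.md (assuming that filename for the script), writes src/uu/ls/locales/en-US.ftl with one entry per extracted section. The values below are only a sketch of the shape, not the real ls help text:

ls-about = List directory contents.
ls-usage = ls [OPTION]... [FILE]...
ls-after-help = first line of the "After Help" section
  continuation lines indented by at least two spaces, as emitted by format_fluent_multiline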

And a second script to update a bunch of the code:

LIST=$(rg -l "\.about\(" src/uu/ | sort)
echo "$LIST"
for f in $LIST; do
    # Program name from the path, e.g. src/uu/ls/src/ls.rs -> ls
    p=$(echo "$f" | cut -d/ -f3)
    echo "$p"
    # Swap the const-based help strings for runtime Fluent lookups
    sed -i -e "s|.about(ABOUT)|.about(get_message(\"$p-about\"))|" "$f"
    sed -i -e "s|.format_usage(USAGE)|\(format_usage(\&get_message(\"$p-usage\"))|" "$f"
    sed -i -e "s|.after_help(AFTER_HELP)|.after_help(get_message(\"$p-after-help\"))|" "$f"

    # Replace the old ABOUT const/static declarations with the locale import
    sed -i -e "s|const ABOUT: &str = help_about.*|use uucore::locale::{self, get_message};|g" "$f"
    sed -i -e "s|static const ABOUT.*|use uucore::locale::{self, get_message};|g" "$f"
    sed -i -e "s|static ABOUT.*|use uucore::locale::{self, get_message};|g" "$f"

    # The USAGE/AFTER_HELP consts are no longer needed
    sed -i -e "/const USAGE/d" "$f"
    sed -i -e "/const AFTER_HELP/d" "$f"
done
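
The net effect on each util's source is roughly this before/after (a sketch using ls as an example; the exact surrounding code varies per util):

// Before: help strings resolved at compile time from the markdown
const ABOUT: &str = help_about!("ls.md");
const USAGE: &str = help_usage!("ls.md");
...
    .about(ABOUT)
    .override_usage(format_usage(USAGE))

// After: the consts are gone and the strings come from the Fluent catalog at runtime
use uucore::locale::get_message;
...
    .about(get_message("ls-about"))
    .override_usage(format_usage(&get_message("ls-usage")))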

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

GNU test failed: tests/basenc/basenc. tests/basenc/basenc is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/cksum/cksum-base64. tests/cksum/cksum-base64 is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/env/env-S. tests/env/env-S is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/factor/factor. tests/factor/factor is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/misc/comm. tests/misc/comm is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/mv/diag. tests/mv/diag is passing on 'main'. Maybe you have to rebase?
Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Skipping an intermittent issue tests/timeout/timeout (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

GNU test failed: tests/basenc/basenc. tests/basenc/basenc is passing on 'main'. Maybe you have to rebase?
Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Skipping an intermittent issue tests/timeout/timeout (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

Congrats! The gnu test tests/cp/link-heap is no longer failing!

@sylvestre sylvestre requested a review from cakebaker June 4, 2025 06:41
@@ -143,6 +143,25 @@ pub fn disable_rust_signal_handlers() -> Result<(), Errno> {
Ok(())
}

pub fn get_canonical_util_name(util_name: &str) -> &str {
Contributor Author:

Yes, it is duplicated code, but I don't know how to fix it yet.

github-actions bot commented Jun 4, 2025

GNU testsuite comparison:

Congrats! The gnu test tests/cp/link-heap is no longer failing!

const ABOUT: &str = help_about!("env.md");
const USAGE: &str = help_usage!("env.md");
const AFTER_HELP: &str = help_section!("after help", "env.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.
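
For example (a sketch; the neighboring import is just illustrative):

use clap::{crate_version, Command};
use uucore::locale::get_message;

rather than leaving the use statement next to the consts it replaces.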

const NAME: &str = "hashsum";
const ABOUT: &str = help_about!("hashsum.md");
const USAGE: &str = help_usage!("hashsum.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("id.md");
const USAGE: &str = help_usage!("id.md");
const AFTER_HELP: &str = help_section!("after help", "id.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("ln.md");
const USAGE: &str = help_usage!("ln.md");
const AFTER_HELP: &str = help_section!("after help", "ln.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("mkdir.md");
const USAGE: &str = help_usage!("mkdir.md");
const AFTER_HELP: &str = help_section!("after help", "mkdir.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("mv.md");
const USAGE: &str = help_usage!("mv.md");
const AFTER_HELP: &str = help_section!("after help", "mv.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.


const ABOUT: &str = help_about!("nice.md");
const USAGE: &str = help_usage!("nice.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.


const ABOUT: &str = help_about!("nproc.md");
const USAGE: &str = help_usage!("nproc.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

github-actions bot commented Jun 4, 2025

GNU testsuite comparison:

Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

@sylvestre sylvestre requested a review from cakebaker June 4, 2025 20:55
@cakebaker cakebaker merged commit ccc6233 into uutils:main Jun 5, 2025
74 checks passed
@cakebaker (Contributor)

Good work :)
