
Conversation

@sylvestre (Contributor)

It requires many changes to make it more robust.

@sylvestre (Contributor Author)

The script to convert md => fluent:

#!/usr/bin/env python3
"""
Convert markdown documentation files to fluent format.
"""

import argparse
import re
import sys
from pathlib import Path


def extract_command_name(filepath):
    """Extract command name from file path or content."""
    # Try to get from filename first
    filename = Path(filepath).stem
    if filename and filename != "README":
        return filename

    # Fallback to parsing the markdown header
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()
        match = re.search(r"^#\s+(\w+)", content, re.MULTILINE)
        if match:
            return match.group(1)

    return None


def escape_fluent_special_chars(text):
    """Escape special characters that might cause issues in Fluent files."""
    if not text:
        return text

    # Handle curly braces which have special meaning in Fluent
    text = text.replace('{', '\\{').replace('}', '\\}')

    return text


def clean_markdown_for_fluent(text):
    """Clean markdown formatting to make it Fluent-compatible."""
    if not text:
        return text

    lines = text.split('\n')
    cleaned_lines = []
    in_code_block = False

    for line in lines:

        # Handle code block fences: skip the fence line itself (with or
        # without a language tag) so only the block's content is kept
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
            continue

        if in_code_block:
            # Inside code blocks, preserve formatting
            cleaned_lines.append(line)
            continue

        # Keep markdown headers as-is (###, ####, etc.)
        # Don't convert them - just preserve them

        # Convert markdown bullet points to simple format
        if re.match(r'^\s*\*\s+', line):
            # Extract the content after the bullet point
            content = re.sub(r'^\s*\*\s+', '', line)
            # Remove inline code backticks but preserve content
            content = re.sub(r'`([^`]*)`', r'\1', content)
            # Get the original indentation level
            indent_level = len(line) - len(line.lstrip())
            # Convert to simple dash format
            line = ' ' * indent_level + '- ' + content

        else:
            # Remove inline code backticks but preserve content for non-bullet lines
            line = re.sub(r'`([^`]*)`', r'\1', line)

        # Clean up excessive backslash escaping (common in printf docs)
        # But be careful not to break intentional escaping
        line = re.sub(r'\\\\\\\\', r'\\\\', line)  # Convert \\\\ to \\

        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)


def preserve_formatting(text):
    """Preserve formatting while cleaning up excessive whitespace."""
    if not text:
        return ""

    # First clean markdown syntax that causes Fluent issues
    text = clean_markdown_for_fluent(text)

    # Split into lines to preserve line structure
    lines = text.split('\n')
    processed_lines = []

    for line in lines:
        # Preserve leading whitespace (indentation) but clean up excessive spaces within the line
        leading_spaces = len(line) - len(line.lstrip())
        content = line.strip()

        if content:
            # Clean up multiple spaces within content but preserve single spaces
            content = re.sub(r' +', ' ', content)
            # Escape special Fluent characters
            content = escape_fluent_special_chars(content)
            # Reconstruct line with original indentation
            processed_lines.append(' ' * leading_spaces + content)
        else:
            # Preserve empty lines
            processed_lines.append('')

    return '\n'.join(processed_lines)


def parse_markdown_file(filepath):
    """Parse a markdown file and extract relevant sections."""
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # Extract command name
    command_name = extract_command_name(filepath)
    if not command_name:
        return None

    # Extract usage (code blocks)
    usage_patterns = []
    code_blocks = re.findall(r"```\n?(.*?)\n?```", content, re.DOTALL)
    for block in code_blocks:
        # Clean up the usage lines while preserving structure
        lines = [line.rstrip() for line in block.split("\n")]
        # Remove empty lines at start and end, but preserve internal empty lines
        while lines and not lines[0].strip():
            lines.pop(0)
        while lines and not lines[-1].strip():
            lines.pop()

        if lines:
            usage_patterns.extend(lines)
            break

    # Extract description (text immediately after usage block, before ## sections)
    description = ""
    # Look for text between the first ``` block and the first ## header
    usage_end_match = re.search(r"```.*?```\s*\n", content, re.DOTALL)
    if usage_end_match:
        remaining_content = content[usage_end_match.end():]
        # Get text until first ## header
        desc_match = re.search(r"^(.*?)(?=\n##|\Z)", remaining_content, re.DOTALL)
        if desc_match:
            desc_text = desc_match.group(1).strip()
            if desc_text:
                description = preserve_formatting(desc_text)

    # Extract after-help section (everything after "## After Help")
    after_help = ""
    after_help_match = re.search(
        r"##\s*After [Hh]elp\s*\n(.*?)(?=\Z)", content, re.MULTILINE | re.DOTALL
    )
    if after_help_match:
        desc_text = after_help_match.group(1).strip()
        if desc_text:
            after_help = preserve_formatting(desc_text)

    return {
        "command": command_name,
        "usage": usage_patterns,
        "description": description,
        "after_help": after_help,
    }


def format_fluent_multiline(text):
    """Format text for Fluent multi-line values."""
    if not text or '\n' not in text:
        return text

    lines = text.split('\n')
    formatted_lines = []

    for i, line in enumerate(lines):
        if i == 0:
            # First line stays as-is
            formatted_lines.append(line)
        else:
            # All subsequent lines need at least 2 spaces of indentation for Fluent
            if line.strip():
                # Preserve existing indentation but ensure minimum 2 spaces
                existing_indent = len(line) - len(line.lstrip())
                min_indent = max(2, existing_indent + 2)
                formatted_lines.append(' ' * min_indent + line.lstrip())
            else:
                # Empty lines
                formatted_lines.append('')

    return '\n'.join(formatted_lines)


def convert_to_fluent(parsed_data):
    """Convert parsed data to fluent format."""
    if parsed_data is None:
        return ""

    command = parsed_data["command"]
    lines = []

    # Add description (about)
    if parsed_data["description"]:
        formatted_desc = format_fluent_multiline(parsed_data["description"])
        lines.append(f"{command}-about = {formatted_desc}")

    # Add usage
    if parsed_data["usage"]:
        # Join multiple usage patterns
        usage_text = '\n'.join(parsed_data["usage"])
        formatted_usage = format_fluent_multiline(usage_text)
        lines.append(f"{command}-usage = {formatted_usage}")

    # Add after-help if present
    if parsed_data["after_help"]:
        formatted_help = format_fluent_multiline(parsed_data["after_help"])
        lines.append(f"{command}-after-help = {formatted_help}")

    return "\n".join(lines)


def get_output_path(input_file):
    """Generate the output .ftl file path based on input markdown file path."""
    input_path = Path(input_file)

    # Extract the program directory (e.g., src/uu/ls/ls.md -> ls)
    if len(input_path.parts) >= 3 and input_path.parts[-3] == "uu":
        program_name = input_path.parts[-2]  # Get the program directory name
        base_dir = input_path.parent  # src/uu/PROGRAM/
        output_dir = base_dir / "locales"
        output_file = output_dir / "en-US.ftl"
        return output_file

    # Fallback: just change extension to .ftl in same directory
    return input_path.with_suffix(".ftl")


def process_directory(input_dir, output_file=None):
    """Process all markdown files in a directory."""
    input_path = Path(input_dir)
    processed_count = 0

    # Find all markdown files
    md_files = list(input_path.rglob("*.md"))

    for md_file in sorted(md_files):
        print(f"Processing: {md_file}")
        try:
            parsed = parse_markdown_file(md_file)
            if parsed:
                fluent_text = convert_to_fluent(parsed)
                if fluent_text:
                    # Generate output path
                    if output_file:
                        output_path = Path(output_file)
                    else:
                        output_path = get_output_path(md_file)

                    # Create output directory if it doesn't exist
                    output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Write the fluent file
                    with open(output_path, "w", encoding="utf-8") as f:
                        f.write(fluent_text)

                    print(f"  ✓ Converted {parsed['command']} -> {output_path}")
                    processed_count += 1
                else:
                    print(f"  ✗ No content extracted from {md_file}")
            else:
                print(f"  ✗ Failed to parse {md_file}")
        except Exception as e:
            print(f"  ✗ Error processing {md_file}: {e}")

    print(f"\nProcessed {processed_count} files successfully.")
    return processed_count


def validate_fluent_content(content):
    """Validate that the generated content doesn't have obvious Fluent syntax issues."""
    lines = content.split('\n')
    issues = []

    for i, line in enumerate(lines, 1):
        # Check for unescaped curly braces
        if '{' in line and '\\{' not in line:
            issues.append(f"Line {i}: Unescaped curly brace might cause parsing issues")

        # Check for proper indentation in multi-line values
        if line.startswith(' ') and len(line.strip()) > 0:
            if not line.startswith('  '):  # Fluent needs at least 2 spaces
                issues.append(f"Line {i}: Insufficient indentation (needs at least 2 spaces)")

        # Check for markdown bullet points that weren't converted
        if re.match(r'^\s*\*\s+`', line):
            issues.append(f"Line {i}: Markdown bullet point with backticks may cause parsing issues")

        # Check for problematic characters at start of lines (after indentation)
        stripped = line.lstrip()
        if stripped and not re.match(r'^[a-zA-Z0-9\-_:.\s]', stripped):
            issues.append(f"Line {i}: Line starts with potentially problematic character: '{stripped[0]}'")

    return issues


def debug_fluent_output(content, max_lines=10):
    """Debug helper to show problematic lines in generated content."""
    lines = content.split('\n')
    print(f"\nFirst {max_lines} lines of generated content:")
    print("-" * 50)

    for i, line in enumerate(lines[:max_lines], 1):
        # Show special characters visibly
        display_line = line.replace('\t', '→').replace(' ', '·')
        print(f"{i:3}: '{display_line}'")

        # Check for potential issues
        if line.strip().startswith('*'):
            print("     ^ WARNING: Line starts with asterisk")
        if '`' in line:
            print("     ^ WARNING: Contains backticks")

    if len(lines) > max_lines:
        print(f"... ({len(lines) - max_lines} more lines)")
    print("-" * 50)


def debug_parsing_steps(filepath, parsed_data):
    """Debug helper to show what was extracted during parsing."""
    print(f"\n--- Debug: Parsing steps for {filepath} ---")

    if parsed_data:
        print(f"Command: {parsed_data['command']}")

        print(f"\nUsage ({len(parsed_data['usage'])} lines):")
        for i, usage in enumerate(parsed_data['usage']):
            print(f"  {i+1}: '{usage}'")

        print(f"\nDescription ({len(parsed_data['description'])} chars):")
        if parsed_data['description']:
            # Show first 100 chars
            desc_preview = parsed_data['description'][:100]
            if len(parsed_data['description']) > 100:
                desc_preview += "..."
            print(f"  '{desc_preview}'")
        else:
            print("  (empty)")

        print(f"\nAfter-help ({len(parsed_data['after_help'])} chars):")
        if parsed_data['after_help']:
            # Show first 200 chars and last 100 chars
            help_text = parsed_data['after_help']
            if len(help_text) > 300:
                preview = help_text[:200] + "\n  ...\n  " + help_text[-100:]
            else:
                preview = help_text
            print(f"  '{preview}'")
        else:
            print("  (empty)")
    else:
        print("  Failed to parse!")

    print("--- End debug ---\n")


def process_single_file(input_file, output_file=None):
    """Process a single markdown file."""
    print(f"Processing: {input_file}")

    try:
        parsed = parse_markdown_file(input_file)
        if parsed:
            print(f"  - Command: {parsed['command']}")
            print(f"  - Usage patterns: {len(parsed['usage'])}")
            print(f"  - Description length: {len(parsed['description']) if parsed['description'] else 0}")
            print(f"  - After-help length: {len(parsed['after_help']) if parsed['after_help'] else 0}")

            # Add debugging
            debug_parsing_steps(input_file, parsed)

            fluent_text = convert_to_fluent(parsed)
            if fluent_text:
                # Validate the generated content
                issues = validate_fluent_content(fluent_text)
                if issues:
                    print("  ⚠️  Potential issues found:")
                    for issue in issues:
                        print(f"     {issue}")

                # Debug output for troubleshooting
                debug_fluent_output(fluent_text)

                # Generate output path
                if output_file:
                    output_path = Path(output_file)
                else:
                    output_path = get_output_path(input_file)

                # Create output directory if it doesn't exist
                output_path.parent.mkdir(parents=True, exist_ok=True)

                # Write the fluent file
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(fluent_text)

                print(f"✓ Output written to: {output_path}")
                return fluent_text
            else:
                print("No content could be extracted.")
        else:
            print("Failed to parse the file.")
    except Exception as e:
        print(f"Error processing file: {e}")
        import traceback
        traceback.print_exc()

    return None


def main():
    parser = argparse.ArgumentParser(
        description="Convert markdown docs to fluent format"
    )
    parser.add_argument("input", help="Input markdown file or directory")
    parser.add_argument(
        "-o",
        "--output",
        help="Output file (only used for single file processing, ignored for directory processing)",
    )

    args = parser.parse_args()

    input_path = Path(args.input)

    if input_path.is_file():
        process_single_file(args.input, args.output)
    elif input_path.is_dir():
        if args.output:
            print(
                "Warning: -o/--output option ignored when processing directories. Files will be generated in their respective locales/ directories."
            )
        process_directory(args.input)
    else:
        print(f"Error: {args.input} is not a valid file or directory")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
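
Running it on a single util, e.g. ./md_to_fluent.py src/uu/ls/ls.md (assuming that filename for the script), writes src/uu/ls/locales/en-US.ftl with one entry per extracted section. The values below are only a sketch of the shape, not the real ls help text:

ls-about = List directory contents.
ls-usage = ls [OPTION]... [FILE]...
ls-after-help = first line of the "After Help" section
  continuation lines indented by at least two spaces, as emitted by format_fluent_multiline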

And a second script to update a bunch of the code:

LIST=$(rg -l "\.about\(" src/uu/ | sort)
echo "$LIST"
for f in $LIST; do
    # Program name from the path, e.g. src/uu/ls/src/ls.rs -> ls
    p=$(echo "$f" | cut -d/ -f3)
    echo "$p"
    # Swap the const-based help strings for runtime Fluent lookups
    sed -i -e "s|.about(ABOUT)|.about(get_message(\"$p-about\"))|" "$f"
    sed -i -e "s|.format_usage(USAGE)|\(format_usage(\&get_message(\"$p-usage\"))|" "$f"
    sed -i -e "s|.after_help(AFTER_HELP)|.after_help(get_message(\"$p-after-help\"))|" "$f"

    # Replace the old ABOUT const/static declarations with the locale import
    sed -i -e "s|const ABOUT: &str = help_about.*|use uucore::locale::{self, get_message};|g" "$f"
    sed -i -e "s|static const ABOUT.*|use uucore::locale::{self, get_message};|g" "$f"
    sed -i -e "s|static ABOUT.*|use uucore::locale::{self, get_message};|g" "$f"

    # The USAGE/AFTER_HELP consts are no longer needed
    sed -i -e "/const USAGE/d" "$f"
    sed -i -e "/const AFTER_HELP/d" "$f"
done
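
The net effect on each util's source is roughly this before/after (a sketch using ls as an example; the exact surrounding code varies per util):

// Before: help strings resolved at compile time from the markdown
const ABOUT: &str = help_about!("ls.md");
const USAGE: &str = help_usage!("ls.md");
...
    .about(ABOUT)
    .override_usage(format_usage(USAGE))

// After: the consts are gone and the strings come from the Fluent catalog at runtime
use uucore::locale::get_message;
...
    .about(get_message("ls-about"))
    .override_usage(format_usage(&get_message("ls-usage")))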

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

GNU test failed: tests/basenc/basenc. tests/basenc/basenc is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/cksum/cksum-base64. tests/cksum/cksum-base64 is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/env/env-S. tests/env/env-S is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/factor/factor. tests/factor/factor is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/misc/comm. tests/misc/comm is passing on 'main'. Maybe you have to rebase?
GNU test failed: tests/mv/diag. tests/mv/diag is passing on 'main'. Maybe you have to rebase?
Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Skipping an intermittent issue tests/timeout/timeout (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

GNU test failed: tests/basenc/basenc. tests/basenc/basenc is passing on 'main'. Maybe you have to rebase?
Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Skipping an intermittent issue tests/timeout/timeout (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

github-actions bot commented Jun 3, 2025

GNU testsuite comparison:

Congrats! The gnu test tests/cp/link-heap is no longer failing!

@sylvestre sylvestre requested a review from cakebaker June 4, 2025 06:41
@@ -143,6 +143,25 @@ pub fn disable_rust_signal_handlers() -> Result<(), Errno> {
Ok(())
}

pub fn get_canonical_util_name(util_name: &str) -> &str {
Contributor Author:

Yes, it is duplicated code, but I don't know how to fix it yet.

github-actions bot commented Jun 4, 2025

GNU testsuite comparison:

Congrats! The gnu test tests/cp/link-heap is no longer failing!

const ABOUT: &str = help_about!("env.md");
const USAGE: &str = help_usage!("env.md");
const AFTER_HELP: &str = help_section!("after help", "env.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.
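
For example (a sketch; the neighboring import is just illustrative):

use clap::{crate_version, Command};
use uucore::locale::get_message;

rather than leaving the use statement next to the consts it replaces.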

const NAME: &str = "hashsum";
const ABOUT: &str = help_about!("hashsum.md");
const USAGE: &str = help_usage!("hashsum.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("id.md");
const USAGE: &str = help_usage!("id.md");
const AFTER_HELP: &str = help_section!("after help", "id.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("ln.md");
const USAGE: &str = help_usage!("ln.md");
const AFTER_HELP: &str = help_section!("after help", "ln.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("mkdir.md");
const USAGE: &str = help_usage!("mkdir.md");
const AFTER_HELP: &str = help_section!("after help", "mkdir.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

const ABOUT: &str = help_about!("mv.md");
const USAGE: &str = help_usage!("mv.md");
const AFTER_HELP: &str = help_section!("after help", "mv.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.


const ABOUT: &str = help_about!("nice.md");
const USAGE: &str = help_usage!("nice.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.


const ABOUT: &str = help_about!("nproc.md");
const USAGE: &str = help_usage!("nproc.md");
use uucore::locale::get_message;
Contributor:

I would move the import to the other imports.

github-actions bot commented Jun 4, 2025

GNU testsuite comparison:

Skipping an intermittent issue tests/misc/tee (passes in this run but fails in the 'main' branch)
Congrats! The gnu test tests/cp/link-heap is no longer failing!

@sylvestre sylvestre requested a review from cakebaker June 4, 2025 20:55
@cakebaker cakebaker merged commit ccc6233 into uutils:main Jun 5, 2025
74 checks passed
@cakebaker (Contributor)

Good work :)
