make-data-file

#!/usr/bin/perl
##############################################################################
#
# Script for munging Unicode character data into a form suitable for UCF
#
# Uses local copies of:
#
#   https://www.unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
#   https://www.unicode.org/Public/UCD/latest/ucdxml/ucd.unihan.grouped.zip
#
# Use --download option to get these files.
#
# Use --help option for info on command line options and details of the output
# file format.
#
# The list of 'General Category' details is hardcoded in the JS frontend, so
# we parse it out from there to ensure both ends are in sync.
#

use 5.014;
use strict;
use warnings;
use autodie;

use Pod::Usage;
use Getopt::Long  qw(GetOptions);
use LWP::Simple   qw(mirror RC_OK RC_NOT_MODIFIED);
use Archive::Zip  qw(:ERROR_CODES :CONSTANTS);
use Archive::Zip::MemberRead qw();

use FindBin;
use XML::SAX::ParserFactory;

# Remote locations: the UCD XML archives and the per-block code chart PDFs
my $url_latest_source_dir = 'https://www.unicode.org/Public/UCD/latest/ucdxml';
my $url_pdf_base          = 'https://www.unicode.org/charts/PDF';

# Directory (beside this script) holding the downloaded source archives
my $local_data_dir        = "$FindBin::Bin/unicode-source-data";

# Names of the source archives, relative to both the remote and local dirs
my $source_file_nounihan  = 'ucd.nounihan.grouped.zip';
my $source_file_all       = 'ucd.all.grouped.zip';

# Generated data files, one per source archive
my $output_file_nounihan  = "$FindBin::Bin/htdocs/char-data-nounihan.txt";
my $output_file_all       = "$FindBin::Bin/htdocs/char-data-all.txt";

# JS frontend source from which the 'General Category' list is parsed
my $js_frontend_source    = "$FindBin::Bin/htdocs/jquery.ucf.js";

my $gcp = '>';  # flag to indicate next char is a GC prefix

# Handle command-line arguments
#
# %opt receives the raw options from GetOptions; %args collects the values
# that are passed on to the UCFHandler constructor.  translate_data() only
# writes the data file when %args is empty, so populating %args here switches
# the script into "dump" mode.

my(%opt, %args);

if(!GetOptions(\%opt,
    'download|d',
    'char|c=s',
    'help|?',
)) {
    pod2usage(-exitval => 1,  -verbose => 0);
}

pod2usage(-exitstatus => 0, -verbose => 2) if $opt{help};

# Test with defined() so that '--char 0' is not mistaken for "no --char".
if(defined $opt{char}) {
    # Accept '41', '0041', 'U+0041' or '0x41' and normalise to the form used
    # by the 'cp' attribute in the source XML: upper-case hex, zero-padded to
    # at least four digits.
    my($hex) = $opt{char} =~ m{\A(?:U[+]|0x)?([0-9A-F]{1,6})\z}i
        or pod2usage(
            -message => "Invalid hex codepoint for --char: '$opt{char}'",
            -exitval => 1,
            -verbose => 0,
        );
    $args{char} = sprintf('%04X', hex($hex));
}


translate_data($source_file_nounihan, $output_file_nounihan);

exit;

# Convert the XML found in the named source archive into the compact text
# format.  When no dump-style option has populated %args, the result is
# written to $output_filename and a gzipped copy is created beside it;
# otherwise no output file is opened.
sub translate_data {
    my($zip_file, $output_filename) = @_;

    my $xml_ref = unzip_xml($zip_file);

    # An empty %args means we are generating the data file
    unless(%args) {
        print "Writing $output_filename\n";
        open my $fh, '>', $output_filename;
        $args{out_fh} = $fh;
    }
    @args{qw(url_pdf_base entity_map)}
        = ($url_pdf_base, scalar load_entities_files());

    my $handler = UCFHandler->new(%args);
    $handler->load_general_categories($js_frontend_source);

    XML::SAX::ParserFactory
        ->parser(Handler => $handler)
        ->parse_string(${$xml_ref});

    my $out_fh = $args{out_fh};
    if($out_fh) {
        close($out_fh);
        make_gzip_version($output_filename);
    }

}


# Locate the local copy of the named archive (via get_zip_file), extract the
# single XML member it contains and return a reference to the XML text.
#
# Dies if the archive cannot be read, if it does not contain exactly one
# member matching '.xml', or if extracting that member fails.
sub unzip_xml {
    my($zip_file) = @_;

    my $local_zip_name  = get_zip_file($zip_file);

    my $zip = Archive::Zip->new();
    if($zip->read($local_zip_name) != AZ_OK) {
        die "Error reading $local_zip_name";
    }
    my @members = $zip->membersMatching('[.]xml$');
    if(@members == 0) {
        die "Can't find an XML file in $local_zip_name";
    }
    elsif(@members > 1) {
        die "Can't identify the correct XML file in $local_zip_name";
    }

    my($xml_member) = @members;
    print "Unzipping " . $xml_member->fileName . "\n";

    # Read until EOF rather than trusting a single read() to deliver the
    # whole member, and treat an undefined result as an extraction error
    # instead of silently returning a partial/undefined buffer.
    my $fh         = $xml_member->readFileHandle;
    my $chunk_size = $xml_member->uncompressedSize;
    my $xml        = '';
    while(1) {
        my $bytes = $fh->read(my $chunk, $chunk_size);
        die "Error extracting " . $xml_member->fileName
            . " from $local_zip_name" unless defined $bytes;
        last if $bytes == 0;
        $xml .= $chunk;
    }
    $fh->close;

    return \$xml;
}


# Return the path of the local copy of the named source archive.  With the
# --download option the file is first mirrored from the remote source
# directory.  Dies if the download fails or if no local copy exists.
sub get_zip_file {
    my($zip_file) = @_;
    my $remote_url = "$url_latest_source_dir/$zip_file";
    my $local_path = "$local_data_dir/$zip_file";

    if($opt{download}) {
        print "Downloading\nFrom: $remote_url\nTo:   $local_path\n";
        my $status = mirror($remote_url, $local_path);
        if($status == RC_NOT_MODIFIED) {
            print "Local copy already up to date.\n";
        }
        elsif($status != RC_OK) {
            die "Download of $remote_url failed: status = $status";
        }
    }

    return $local_path if -e $local_path;

    die "File does not exist:\n  $local_path\n"
      . "Use --download to get the file\n";
}


# Build and return a hashref mapping codepoints (upper-case hex, zero-padded
# to at least four digits) to HTML entity references such as '&nbsp;'.
#
# The map is seeded with '&amp;' and '&lt;' and then extended from every
# '*.ent' file in the local data directory.  Only declarations of the form
#   <!ENTITY name "&#NNN;">
# (decimal character reference) are recognised.  Files are processed in
# sorted order, so if several names map to one codepoint the last one wins.
sub load_entities_files {
    my %entity_for = (
        '0026'  => '&amp;',
        '003C'  => '&lt;',
    );
    foreach my $dtd_file (sort glob("$local_data_dir/*.ent")) {
        open my $fh, '<', $dtd_file or die "open($dtd_file): $!";
        # Read into a lexical so the caller's $_ is not clobbered
        while(my $line = <$fh>) {
            my($name, $code) = $line =~ m{<!ENTITY\s+(\w+)\s+"&#(\d+);">}
                or next;
            $entity_for{ sprintf('%04X', $code) } = "&${name};";
        }
        close($fh);
    }
    return \%entity_for;
}


# Create "$base_file.gz" beside $base_file using the external 'gzip' program,
# then give both files the same modification time.
#
# gzip is started via list-form open so that no shell is involved and file
# names containing spaces or shell metacharacters are handled correctly.
# Dies if gzip cannot be run or fails, if the output cannot be written, or if
# the timestamps cannot be updated.
sub make_gzip_version {
    my($base_file) = @_;
    my $gz_file = $base_file . '.gz';

    print "Compressing to $gz_file\n";

    open my $gz_out, '>', $gz_file or die "open($gz_file): $!";
    binmode($gz_out);

    open my $gzip, '-|', 'gzip', '--best', '-c', '--', $base_file
        or die "Can't run gzip: $!";
    binmode($gzip);

    while(read($gzip, my $chunk, 65536)) {
        print {$gz_out} $chunk or die "write($gz_file): $!";
    }

    # close() on a pipe reports a non-zero exit status from the child
    close($gzip)
        or die "gzip failed for $base_file (exit status " . ($? >> 8) . ")";
    close($gz_out) or die "close($gz_file): $!";

    # Equivalent of 'touch' on both files, using one shared timestamp
    my $now = time;
    utime($now, $now, $base_file, $gz_file) == 2
        or die "Can't update timestamps on $base_file and $gz_file: $!";

    return;
}

package UCFHandler;

use Data::Dumper;

# Constructor.  Any key/value pairs supplied become object attributes;
# 'last_code_point' and 'group_attr' are always (re)initialised, overriding
# any values supplied for those keys.
sub new {
    my($class, @params) = @_;
    my %self = (
        @params,
        last_code_point => 0,
        group_attr      => {},
    );
    return bless \%self, $class;
}

# Parse the 'General Category' list out of the JS frontend source file and
# store it in $self->{gen_cat}, keyed by category code (the first field).
#
# The list is expected to follow a line containing
# 'var general_categories_spec', as a run of quoted strings each ending in a
# literal '\n' and holding 'CODE => CATEGORY => EXTRA'.  Parsing stops at the
# first blank line.  Each entry is assigned a single-character index code,
# starting at '0' and advancing through consecutive character codes.
#
# Returns the hashref that was stored.  Dies if the file cannot be opened or
# if the marker line is not found (rather than looping forever at EOF).
sub load_general_categories {
    my($self, $js_file) = @_;

    open my $fh, '<', $js_file or die "open($js_file): $!";

    # Skip ahead to the line which introduces the category spec
    my $found_spec = 0;
    while(my $line = <$fh>) {
        if($line =~ /var general_categories_spec/) {
            $found_spec = 1;
            last;
        }
    }
    die "Can't find 'var general_categories_spec' in $js_file"
        unless $found_spec;

    my $i = 0;
    my %gen_cat;
    while(my $line = <$fh>) {
        last unless $line =~ /\S/;   # Bail out at first blank line
        my($spec) = $line =~ m{"([^"]+)\\n"} or next;
        my($code, $category, $extra) = split /\s+=>\s+/, $spec or next;
        $gen_cat{$code} = {
            category  => $category,
            extra     => $extra,
            code      => chr( ord('0') + $i++),
        };
    }
    close($fh);

    return $self->{gen_cat} = \%gen_cat;
}

# Return the general-category prefix for a character: the GC flag character
# followed by the index code of the character's category.  The category is
# taken from the character's own 'gc' attribute, falling back to the current
# group's.  An empty string is returned when the prefix is the same as the
# one returned by the previous call.  Dies if no category can be determined
# or if the category was not loaded by load_general_categories.
sub general_category_code {
    my($self, $attr) = @_;

    my $gc = $attr->{gc};
    $gc //= $self->group_attr->{gc};
    die "No 'gc' for U+$attr->{cp}" unless defined $gc;

    my $cat = $self->{gen_cat}->{$gc};
    die "General category code '$gc' is not yet supported" unless defined $cat;

    my $code     = $gcp . $cat->{code};
    my $previous = $self->{last_gc_code} // '';
    return '' if $previous eq $code;

    $self->{last_gc_code} = $code;
    return $code;
}

# Write one line to the output file; a no-op when no output filehandle was
# supplied.
#
# A line starting with one of the range type characters  # % ^ !  (i.e. a
# range with no explicit offset) is not written immediately but held back.
# If the next line is a range of the same type, its extent is added to the
# held line; otherwise the held line is written before the new line is
# handled.
sub output {
    my($self, $line) = @_;
    my $fh = $self->{out_fh} or return;

    my $pending = delete $self->{deferred};
    my($type) = $line =~ /^([#%^!])/;

    # Ordinary line: flush anything held back, then write it
    if(!defined $type) {
        print $fh "$pending\n" if $pending;
        return print $fh "$line\n";
    }

    # Range line: merge with, or flush, the previously held range
    if($pending) {
        my($pending_type) = $pending =~ /^([#%^!])/;
        if($type eq $pending_type) {
            my($extent) = $line =~ m{^(?:[+]\d+)?[#%^!](\d+)};
            $line = $pending
                =~ s{^((?:[+]\d+)?[#%^!])(\d+)}{$1 . ($2 + $extent)}er;
        }
        else {
            print $fh "$pending\n";
        }
    }
    $self->{deferred} = $line;
    return;
}

# Read-only accessor for the base URL of the code chart PDFs.
sub url_pdf_base {
    my($self) = @_;
    return $self->{url_pdf_base};
}

# Look up the HTML entity reference for a codepoint (given as the hex string
# used for keys of the entity map).  Returns nothing if there is no true
# value in the map for that codepoint.
sub entity_for_cp {
    my($self, $cp) = @_;
    my $entity = $self->{entity_map}->{$cp};
    return $entity if $entity;
    return;
}

# Get/set accessor for the (numeric) codepoint most recently accounted for
# in the output.
sub last_code_point {
    my($self, @value) = @_;
    ($self->{last_code_point}) = @value if @value;
    return $self->{last_code_point};
}

# Return the offset field for a line describing codepoint $this_cp, relative
# to the last codepoint output: '' when it is exactly one more (the implied
# default), otherwise '+N'.  Dies if $this_cp is lower than the last
# codepoint.
sub offset {
    my($self, $this_cp) = @_;
    my $delta = $this_cp - $self->last_code_point;
    if($delta < 0) {
        die sprintf('Codepoint %04X out of sequence', $this_cp);
    }
    return '' if $delta == 1;
    return "+$delta";
}

# Get/set accessor for the name template ('na' attribute) of the current
# group.
sub group_name_template {
    my($self, @value) = @_;
    ($self->{group_name_template}) = @value if @value;
    return $self->{group_name_template};
}

# Get/set accessor for the flag recording whether the current group has the
# control character category.
sub group_is_control_char {
    my($self, @value) = @_;
    ($self->{group_is_control_char}) = @value if @value;
    return $self->{group_is_control_char};
}

# Get/set accessor for the flag recording whether the current group's block
# is a private use area.
sub group_is_pua {
    my($self, @value) = @_;
    ($self->{group_is_pua}) = @value if @value;
    return $self->{group_is_pua};
}

# True if the current character is a control character: decided by the
# character's own 'gc' attribute when it has one, otherwise by the flag
# recorded for the enclosing group.
sub is_control_char {
    my($self) = @_;
    my $attr = $self->char_attr;
    my $gc   = $attr ? $attr->{gc} : undef;
    return $gc eq 'Cc' if $gc;
    return $self->group_is_control_char;
}

# Get/set accessor for the attribute hash of the current group.
sub group_attr {
    my($self, @value) = @_;
    ($self->{group_attr}) = @value if @value;
    return $self->{group_attr};
}

# Get/set accessor for the attribute hash of the character currently being
# processed.
sub char_attr {
    my($self, @value) = @_;
    ($self->{char_attr}) = @value if @value;
    return $self->{char_attr};
}

# Add an alias to the current character, unless it is undefined or empty, or
# duplicates the character's name or an existing alias (either exactly, or
# once the first ' WITH ' in that name/alias is reduced to a single space).
sub add_alias {
    my($self, $new) = @_;
    my $attr = $self->char_attr or return;
    return unless defined $new;
    return if $new eq '';

    my $aliases = $attr->{aliases};
    for my $known ($attr->{name}, @$aliases) {
        return if $known eq $new;
        (my $without_with = $known) =~ s/ WITH / /;
        return if $without_with eq $new;
    }
    push @$aliases, $new;
}

# Dispatch an element event to a method named "<event>_<element name>" (with
# hyphens in the element name changed to underscores), if this object has
# such a method.  Elements without a matching method are ignored.
sub _delegate {
    my($self, $event, $data) = @_;

    (my $element = $data->{LocalName}) =~ tr/-/_/;
    my $method = "${event}_${element}";
    if($self->can($method)) {
        $self->$method($data);
    }
}

# Flatten the attributes of an element event into a simple hashref mapping
# each attribute's LocalName to its Value.
sub _attr {
    my($self, $data) = @_;
    my %value_of;
    for my $attribute (values %{ $data->{Attributes} }) {
        $value_of{ $attribute->{LocalName} } = $attribute->{Value};
    }
    return \%value_of;
}

# SAX callback for the start of any element: hand off to the element-specific
# 'start_element_*' method, if there is one.
sub start_element {
    my($self, $element) = @_;
    return $self->_delegate(start_element => $element);
}

# SAX callback for the end of any element: hand off to the element-specific
# 'end_element_*' method, if there is one.
sub end_element {
    my($self, $element) = @_;
    return $self->_delegate(end_element => $element);
}

# Start of <description>: create an empty text buffer.  The existence of the
# 'description' key is what tells characters() to collect text.
sub start_element_description {
    my($self) = @_;
    return $self->{description} = '';
}

# End of <description>: stop collecting text and output it as a line.  If the
# collected text contains a three-part dotted version number (N.N.N) then
# only that version number is output.
sub end_element_description {
    my($self) = @_;
    my $text = delete $self->{description};
    my($version) = $text =~ m{(\d+[.]\d+[.]\d+)};
    return $self->output(defined $version ? $version : "$text");
}

# SAX callback for character data.  Text is only collected while a
# 'description' buffer exists (i.e. inside a <description> element);
# everything else is ignored.
sub characters {
    my($self, $data) = @_;
    if(exists $self->{description}) {
        return $self->{description} .= $data->{Data};
    }
    return;
}

# Start of <group>: remember the group's attributes, plus three values
# derived from them which characters in the group fall back on: whether the
# group's 'gc' is 'Cc', the group's 'na' name template, and whether the
# group's 'blk' mentions 'PUA'.
sub start_element_group {
    my($self, $data) = @_;
    my $attr = $self->_attr($data);
    my $gc   = $attr->{gc}  // '';
    my $blk  = $attr->{blk} // '';

    $self->group_attr($attr);
    $self->group_is_control_char($gc eq 'Cc' ? 1 : 0);
    $self->group_name_template($attr->{na});
    $self->group_is_pua($blk =~ /PUA/ ? 1 : 0);
}

# End of <group>: discard everything start_element_group recorded, so that
# no group-level defaults (attributes, control-char flag, name template or
# PUA flag) can leak into elements which follow the group.
sub end_element_group {
    my($self) = @_;
    $self->group_attr({});
    $self->group_is_control_char(0);
    $self->group_name_template(undef);
    $self->group_is_pua(0);
    return;
}

# Start of <char>: make this the current character and work out its name.
#
#  * control characters are named '<control>', with 'na' added as an alias
#  * otherwise the name is 'na', or 'na1' if 'na' is not defined
#  * a non-control character with no name in a group that has a name
#    template is output immediately as a templated ('#') range
#  * any other character with no name is output as a private use ('*') range
#    if its block or its group says it is in a PUA, and is otherwise skipped
#
# For named characters 'na1' is added as an alias and 'is_control' is set to
# 'Yes' or 'No'; the output line itself is produced by end_element_char.
sub start_element_char {
    my($self, $data) = @_;
    my $attr = $self->_attr($data);
    $attr->{aliases} = [];
    $self->char_attr($attr);

    if($self->is_control_char) {
        $attr->{name} = '<control>';
        $self->add_alias($attr->{na});
    }
    else {
        $attr->{name} = $attr->{na} // $attr->{na1};
        my $tmpl = $attr->{name} ? undef : $self->group_name_template;
        return $self->special_range($attr, '#', '<char na="#">', $tmpl)
            if $tmpl;
    }

    unless($attr->{name}) {
        my $in_pua = ($attr->{blk} || '') =~ /PUA/ || $self->group_is_pua;
        $self->special_range($attr, '*', '<char blk="PUA">') if $in_pua;
        return;
    }

    $self->add_alias($attr->{na1});
    $attr->{is_control} = $self->is_control_char ? 'Yes' : 'No';
}

# Start of <name-alias>: add the 'alias' attribute as an alias of the current
# character.  Aliases whose 'type' is 'correction' (in any letter case) are
# marked with a ' (correction)' suffix.
sub start_element_name_alias {
    my($self, $data) = @_;
    my $attr  = $self->_attr($data);
    my $alias = $attr->{alias};
    return unless defined $alias;

    my $type = $attr->{type} || '';
    $alias .= ' (correction)' if lc($type) eq 'correction';
    $self->add_alias($alias);
}

# Start of <reserved>: output the codepoint(s) as a reserved ('%') range.
sub start_element_reserved {
    my($self, $data) = @_;
    $self->special_range($self->_attr($data), '%', '<reserved>');
}

# Start of <noncharacter>: output the codepoint(s) as a non-character ('!')
# range.
sub start_element_noncharacter {
    my($self, $data) = @_;
    $self->special_range($self->_attr($data), '!', '<noncharacter>');
}

# Start of <surrogate>: output the codepoint(s) as a surrogate ('^') range.
sub start_element_surrogate {
    my($self, $data) = @_;
    $self->special_range($self->_attr($data), '^', '<surrogate>');
}

# Output one line describing a range of codepoints which get no individual
# entries.  The range comes from the 'first-cp'/'last-cp' attributes, or from
# 'cp' for a single codepoint.  The line consists of the offset, the type
# prefix, the number of codepoints and then any @extra values as
# tab-separated fields.  $type is only used in the error message.
# Afterwards the last codepoint in the range becomes the 'last_code_point'.
# Dies if the range cannot be determined.
sub special_range {
    my($self, $attr, $prefix, $type, @extra) = @_;
    my $first_hex = $attr->{'first-cp'} // $attr->{cp};
    my $last_hex  = $attr->{'last-cp'}  // $attr->{cp};
    if(!$first_hex or !$last_hex) {
        die "Can't determine $type range from:\n" . dump_attr($attr) . "\n";
    }

    my($first, $last) = map { hex($_) } ($first_hex, $last_hex);
    my $offset = $self->offset($first);
    my $extent = $last - $first + 1;
    my $fields = join('', map { "\t$_" } @extra);

    $self->output("$offset$prefix$extent$fields");
    $self->last_code_point($last);
}

# End of <char>: produce the output line for the current character.
#
# A <char> without a 'cp' attribute is output as a templated ('#') range if
# its name contains '#', and is otherwise ignored; characters with no name
# are ignored too.  For everything else the line is made up of:
#
#   offset, '"' prefix (only when the name starts with a character that could
#   be mistaken for a type prefix), general category prefix (only when the
#   category has changed), name, and tab-separated aliases (preceded by the
#   HTML entity, if there is one).
#
# The first letter of the name is forced to lower case to flag combining
# characters (general category 'Me' or a non-zero combining class), and to
# upper case otherwise.
#
# When a target codepoint was supplied as the 'char' constructor argument,
# nothing more happens for other characters; the target character is dumped
# to STDOUT and the program exits.
sub end_element_char {
    my($self, $data) = @_;
    my $attr = $self->char_attr;
    if(!$attr->{cp}) {
        if($attr->{name} and $attr->{name} =~ /#/) {
            $self->special_range($attr, '#', '<char na="#">', $attr->{name});
        }
        return;
    }
    return unless $attr->{name};
    $self->add_unofficial_aliases;
    my $cp     = $attr->{cp};
    my $dcp    = hex($cp);
    my $name   = $attr->{name};
    my $offset = $self->offset($dcp);

    my $aliases = join('; ', @{ $attr->{aliases} });
    if(my $ent = $self->entity_for_cp($cp)) {
        $aliases = $ent . $aliases;
    }
    $aliases = "\t" . $aliases if $aliases ne '';

    # Quote names which would otherwise look like they start with a type
    # prefix, an offset or an entity
    my $prefix = $name =~ /^[+"%#^*!\[&]/ ? '"' : '';

    die "$cp name starts with '$gcp'" if $name =~ /^\Q$gcp\E/;

    # Add a gc prefix if the 'General Category' has changed
    my $gc_code = $self->general_category_code($attr);
    my $gc = $attr->{gc} // $self->group_attr->{gc} // '';
    die "No 'gc' for U+$cp" unless $gc;

    # If it's a combining character, make first letter of name lowercase
    my $ccc = $attr->{ccc} // $self->group_attr->{ccc};
    if($gc eq 'Me' or ($ccc and $ccc ne '0')) {
        # Anchored: the case flag is carried by the *first* character, so a
        # letter elsewhere in the name is not good enough
        die "'$name' does not start with a letter" unless $name =~ /^[a-zA-Z]/;
        $name = lcfirst $name;
    }
    else {
        $name = ucfirst $name;
    }

    $self->output("$offset$prefix$gc_code$name$aliases");
    if(my $target = $self->{char}) {
        return unless $attr->{cp} eq $target;
        print "$cp\t$name$aliases\n";
        print dump_attr($attr), "\n";
        exit;
    }
    $self->last_code_point($dcp);
}

# Start of <block>: output a '[' line giving the block's first and last
# codepoints, its name, and the file name and URL of its code chart PDF
# (named after the first codepoint).  Blocks missing any of the 'first-cp',
# 'last-cp' or 'name' attributes are ignored.
sub start_element_block {
    my($self, $data) = @_;
    my $attr = $self->_attr($data);
    my($first, $last, $name) = @{$attr}{qw(first-cp last-cp name)};
    return unless $first;
    return unless $last;
    return unless $name;

    my $file = "U${first}.pdf";
    my $url  = $self->url_pdf_base . "/$file";
    $self->output(join("\t", "[$first", $last, $name, $file, $url));
}

# Annotate the current character so that it can also be found by an
# unofficial name.  Currently: if the name or an alias contains the word
# DIAERESIS, and neither UMLAUT nor GREEK appears anywhere in the name or
# aliases, then ' (OR UMLAUT)' is inserted after the first DIAERESIS found
# (checking the name first, then each alias).
sub add_unofficial_aliases {
    my($self) = @_;
    my $attr    = $self->char_attr;
    my $aliases = $attr->{aliases};
    my $haystack = join(' ', $attr->{name}, @$aliases);

    return unless $haystack =~ /\bDIAERESIS\b/i;
    return if     $haystack =~ /\bUMLAUT\b/i;
    return if     $haystack =~ /\bGREEK\b/i;

    # The loop variable aliases the real values, so this edits them in place
    for my $label ($attr->{name}, @$aliases) {
        last if $label =~ s/\b(DIAERESIS)\b/$1 (OR UMLAUT)/i;
    }
}

# Format an attribute hash for display: the 'cp' value (if true) on the
# first line, followed by one indented 'key: value' line for every other
# attribute, sorted by key.  Array values (such as the 'aliases' list) are
# shown as their elements joined with '; ' rather than as 'ARRAY(0x...)',
# and undefined values are shown as empty.
sub dump_attr {
    my($attr) = @_;

    my @lines = $attr->{cp} ? ($attr->{cp}) : ();
    foreach my $key (sort grep { $_ ne 'cp' } keys %$attr) {
        my $value = $attr->{$key};
        $value = join('; ', @$value) if ref($value) eq 'ARRAY';
        push @lines, "  $key: " . ($value // '');
    }
    return join("\n", @lines);
}

1;

__END__

=head1 NAME

make-data-file - generate data file for Unicode Character Finder

=head1 SYNOPSIS

  make-data-file [options]

  Options:

   --download     get a fresh copy of source data if required
   --char <num>   hex codepoint number of character to be dumped
   --help         detailed help message

=head1 DESCRIPTION

This script generates the data file used by the Unicode Character Finder.  It
processes the XML Unicode Character Data file and produces a plain text (tab
separated) data file as output.  The format of the output file is described
below.

=head1 OPTIONS

=over 4

=item B<< --download >> (alias: B<-d>)

Download latest source data .zip files from the Unicode Consortium web site
and store in a local directory.

=item B<< --char <hex-num> >> (alias: B<-c>)

Instead of writing out a data file, dump the attributes of the specified
character on STDOUT.

=item B<--help> (alias: B<-?>)

Display this documentation.

=back

=head1 OUTPUT FILE FORMAT

The role of this script is to read the Unicode Character Database XML and
output a much smaller, plain-text file listing all the characters and block
ranges with a tiny subset of the attributes from the original data.

The file starts with a single line containing the version number of the Unicode
database that was used to generate the file, followed by a series of lines
which describe:

=over 4

=item Characters

one codepoint or a range of codepoints per line

=item Character blocks

one block per line

=item HTML entities

mapping one entity name to one codepoint per line

=back

Lines describing characters start with an offset and are followed by a type
prefix character then tab-separated fields.

The offset starts with a '+' and is followed by one or more digits.  For
example '+10' would imply that the fields that follow describe the character
with a codepoint value 10 more than the one described on the previous line.
The codepoint counter is assumed to be zero initially so the first line would
start with +0 to define character U+0000.  If the offset field is not present,
a value of '+1' is implied.

After the offset is a single punctuation character which identifies the type of
record the remaining fields define.  If this type prefix is missing, a prefix
of '"' is implied.  The type prefixes and the meaning of the fields that follow
are:

=head2 '"' A character

=over 4

=item *

description (may be prefixed with '>x' where 'x' is an index for the general
category of this and following characters; note also: descriptions seem to
always be upper case, but we force the initial letter to lower case to flag
combining characters)

=item *

aliases (may be prefixed by '&name;' if there is a named HTML character entity)

=back

=head2 '#' A range of characters with templated descriptions

=over 4

=item *

number of codepoints in the range

=item *

the description template, in which '#' will be replaced with the codepoint

=back

=head2 '%' A reserved character range

=over 4

=item *

number of codepoints in the range

=back

=head2 '^' A range of surrogate codepoints

=over 4

=item *

number of codepoints in the range

=back

=head2 '!' A range of non-characters

=over 4

=item *

number of codepoints in the range

=back

=head2 '*' A range of "private use" characters (a PUA)

=over 4

=item *

number of codepoints in the range

=back

=head2 '[' A block description

=over 4

=item *

codepoint of first character

=item *

codepoint of last character

=item *

block name

=item *

PDF filename

=item *

PDF URL

=back

The '[' lines do not include or imply an offset.

=cut

=head1 AUTHOR AND COPYRIGHT

Copyright (c) 2010-2017 Grant McLean <grant@mclean.net.nz>

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU Affero General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option) any
later version.