From 86e0e1ca059087406e9b1fbde5a1e362a4ed276c Mon Sep 17 00:00:00 2001 From: Ulf Hermjakob Date: Mon, 30 Nov 2020 00:08:26 -0800 Subject: [PATCH] v1.3.7 adds better handling for Cyrillic text, minor improvements for em-dash, en-dash, replacement character --- current | 2 +- v1.3.6/README.txt | 4 +- v1.3.7/LICENSE.txt | 10 + v1.3.7/README.txt | 100 + v1.3.7/bin/add-prefix.pl | 9 + v1.3.7/bin/normalize-workset-sentences.pl | 13 + v1.3.7/bin/tokenize-english.pl | 51 + v1.3.7/bin/xml-reader.pl | 112 + v1.3.7/data/BioSplitPatterns.txt | 390 + v1.3.7/data/EnglishAbbreviations.txt | 388 + v1.3.7/lib/NLP/English.pm | 3114 ++ v1.3.7/lib/NLP/SntSegmenter.pm | 98 + v1.3.7/lib/NLP/UTF8.pm | 1353 + v1.3.7/lib/NLP/utilities.pm | 3608 ++ v1.3.7/lib/NLP/xml.pm | 802 + v1.3.7/test/Cancel_Cell_pmid17418411.nxml | 3 + v1.3.7/test/Cancel_Cell_pmid17418411.txt-ref | 287 + v1.3.7/test/amr-general-corpus.tok-ref | 39260 +++++++++++++++++ v1.3.7/test/amr-general-corpus.txt | 39260 +++++++++++++++++ v1.3.7/test/bio-amr-snt.tok-ref | 6952 +++ v1.3.7/test/bio-amr-snt.txt | 6952 +++ v1.3.7/test/tok-challenge.tok-ref | 17 + v1.3.7/test/tok-challenge.txt | 17 + 23 files changed, 102799 insertions(+), 3 deletions(-) create mode 100644 v1.3.7/LICENSE.txt create mode 100755 v1.3.7/README.txt create mode 100755 v1.3.7/bin/add-prefix.pl create mode 100755 v1.3.7/bin/normalize-workset-sentences.pl create mode 100755 v1.3.7/bin/tokenize-english.pl create mode 100755 v1.3.7/bin/xml-reader.pl create mode 100644 v1.3.7/data/BioSplitPatterns.txt create mode 100755 v1.3.7/data/EnglishAbbreviations.txt create mode 100755 v1.3.7/lib/NLP/English.pm create mode 100755 v1.3.7/lib/NLP/SntSegmenter.pm create mode 100755 v1.3.7/lib/NLP/UTF8.pm create mode 100755 v1.3.7/lib/NLP/utilities.pm create mode 100755 v1.3.7/lib/NLP/xml.pm create mode 100644 v1.3.7/test/Cancel_Cell_pmid17418411.nxml create mode 100644 v1.3.7/test/Cancel_Cell_pmid17418411.txt-ref create mode 100644 v1.3.7/test/amr-general-corpus.tok-ref create mode 100644 v1.3.7/test/amr-general-corpus.txt create mode 100644 v1.3.7/test/bio-amr-snt.tok-ref create mode 100644 v1.3.7/test/bio-amr-snt.txt create mode 100644 v1.3.7/test/tok-challenge.tok-ref create mode 100755 v1.3.7/test/tok-challenge.txt diff --git a/current b/current index 90f1642..ab74b63 120000 --- a/current +++ b/current @@ -1 +1 @@ -v1.3.6 \ No newline at end of file +v1.3.7 \ No newline at end of file diff --git a/v1.3.6/README.txt b/v1.3.6/README.txt index 922acdb..a12d598 100755 --- a/v1.3.6/README.txt +++ b/v1.3.6/README.txt @@ -1,5 +1,5 @@ -tok-eng version 1.3.5 -Release date: April 2, 2019 +tok-eng version 1.3.6 +Release date: November 28, 2019 Author: Ulf Hermjakob, USC Information Sciences Institute English tokenizer tokenize-english.pl diff --git a/v1.3.7/LICENSE.txt b/v1.3.7/LICENSE.txt new file mode 100644 index 0000000..c5f6068 --- /dev/null +++ b/v1.3.7/LICENSE.txt @@ -0,0 +1,10 @@ +Copyright (C) 2015-2020 Ulf Hermjakob, USC Information Sciences Institute + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +Any publication of projects using uroman shall acknowledge its use: "This project uses the English tokenizer written by Ulf Hermjakob, USC Information Sciences Institute (2015-2020)". + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/v1.3.7/README.txt b/v1.3.7/README.txt new file mode 100755 index 0000000..5006471 --- /dev/null +++ b/v1.3.7/README.txt @@ -0,0 +1,100 @@ +tok-eng version 1.3.7 +Release date: November 30, 2020 +Author: Ulf Hermjakob, USC Information Sciences Institute + +English tokenizer tokenize-english.pl + +Usage: tokenize-english.pl [--bio] < STDIN + Option --bio is for biomedical domain. + +Example: bin/tokenize-english.pl --bio < test/tok-challenge.txt > test/tok-challenge.tok +Example: bin/tokenize-english.pl --bio < test/bio-amr-snt.txt > test/bio-amr-snt.tok +Example: bin/tokenize-english.pl < test/amr-general-corpus.txt > test/amr-general-corpus.tok + +Tokenizer uses two data files: +(1) List of common English abbreviations (data/EnglishAbbreviations.txt) + e.g. Jan., Mr., Ltd., i.e., fig. in order to keep abbreviation periods + attached to their abbreviations. +(2) List of bio patterns to be split/not split (data/BioSplitPatterns.txt) + e.g. 'SPLIT-DASH-X activated' means that 'P53-activated' should be + split into 'P53 @-@ activated' + e.g. 'DO-NOT-SPLIT up-regulate' that 'up-regulate' should stay together. + +The tokenizer (in --bio mode) includes a few expansions such as + Erk1/2 -> Erk1 @/@ Erk2 + Slac2-a/b/c -> Slac2-a @/@ Slac2-b @/@ Slac2-c +which go beyond tokenization in the strictest sense. + +The tokenizer (in --bio mode) attempts to split compounds of multiple +molecules while keeping together names for single molecules as far as +this is possible without an extensive database of molecule names. +Example: 'ZO-1/ZO-2/ZO-3' -> 'ZO-1 @/@ ZO-2 @/@ ZO-3' + +But without an extensive corpus of molecule names, there are some +limitations in cases such as 'spectrin-F-actin' where heuristics +might suggest us that "F" is an unlikely molecule name, but where +it's not clear from simple surface patterns whether the proper +decomposition is + spectrin @-@ F-actin or + spectrin-F @-@ actin or + spectrin-F-actin. +(Based on biological knowledge, the first alternative is the correct +one, but the tokenizer leaves 'spectrin-F-actin' unsplit.) + +----------------------------------------------------------------- + +Changes in version 1.3.7: +- Better handling of Cyrillic text, especially hyphenated tokens. +- Better handling of some em/en-dashes, replacement character at beginning or end of token. +Changes in version 1.3.5: +- Better treatment of extended Latin (e.g. Lithuanian), Cyrillic scripts +- minor improvements re: km2 &x160; No./No.2 +Changes in version 1.3.4: +- Replace replacement character with original character in some predictable cases. +- Minor incremental improvements/corrections. +Changes in version 1.3.3: +- Various incremental improvements, particularly relating to period splitting. +- Question marks and exclamation marks are separate tokens (as opposed to clusters of question and exclamation marks). + +Changes in version 1.3.2: +- Improved treatment of punctuation, particular odd characters (trademark sign, + British pound sign) and clusters of punctuation. +- Rare xml-similar tags such [QUOTE=...] and [/IMG] +- Split won't -> will n't; ain't -> is n't; shan't -> shall n't; cannot -> can not +- Keep together: ftp://... e.g. ftp://ftp.funet.fi/pub/standards/RFC/rfc959.txt +- Keep together: mailto:... e.g. mailto:ElRushbo@eibnet.com +- Keep together Twitter hashtags and handles e.g. #btw2017 @nimjan_uyghur +- Impact: 4-5% of sentences in general AMR corpus + +----------------------------------------------------------------- + +XML sentence extractor xml-reader.pl + +Usage: xml-reader.pl -i [--pretty [--indent ]] [--html ] [--docid ] [--type {nxml|elsxml|ldcxml}] + is the input file in XML format + --pretty is an option that will cause the output to be XML in "pretty" indented format. + -- index is a suboption to specify the number of space characters per indentation level + --html specifies an optional output file in HTML that displays the output sentences + in a format easily readable (and checkable) by humans + --docid is an optional input; needed in particular if system can't find docid + inside input XML file. + --type {nxml|elsxml} specifies optional special (non-standard) input type (XML variant). + Type will be automatically deduced for filenames ending in .nxml or .elsxml. + +Example: bin/xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml | bin/normalize-workset-sentences.pl | bin/add-prefix.pl a3_ > test/Cancel_Cell_pmid17418411.txt + Output file test/Cancel_Cell_pmid17418411.txt should match reference file test/Cancel_Cell_pmid17418411.txt-ref + Postprocessing with normalize-workset-sentences.pl and add-prefix.pl a3_ is recommended. (See note below.) +Example: xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml --pretty --indent 3 +Example: xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml --html test/Cancel_Cell_pmid17418411.html --docid PMID:17418411 --type nxml + +Auxiliary micro-scripts: + normalize-workset-sentences.pl < STDIN + normalized spaces wrt XML tags xref/title/sec-title. + add-prefix.pl < STDIN + adds prefix at beginning of each line. +It is strongly recommended to use normalize-workset-sentences.pl and add-prefix.pl a3_ +where the a3_-prefix indicates that the segmented sentences have been generated +automatically. This allows fresh sentence IDs in the future for manually corrected +sentence segmentation or improved sentence segmentation without created a sentence ID +conflict. + diff --git a/v1.3.7/bin/add-prefix.pl b/v1.3.7/bin/add-prefix.pl new file mode 100755 index 0000000..f4aa18f --- /dev/null +++ b/v1.3.7/bin/add-prefix.pl @@ -0,0 +1,9 @@ +#!/usr/bin/perl -w +# Author: Ulf Hermjakob +# Created: July 20, 2004 +# Add prefix to lines from stdin + +$prefix = $ARGV[0]; +while () { + print "$prefix$_"; +} diff --git a/v1.3.7/bin/normalize-workset-sentences.pl b/v1.3.7/bin/normalize-workset-sentences.pl new file mode 100755 index 0000000..f328e39 --- /dev/null +++ b/v1.3.7/bin/normalize-workset-sentences.pl @@ -0,0 +1,13 @@ +#!/usr/bin/perl -w +# Author: Ulf Hermjakob + +while(<>) { + s/(]+>)\s*(\[\d+\])\s*(<\/xref>)/$1$2$3/g; + s/(]+>)\s*(\S.*?\S|\S)\s*(<\/title>)/$1$2$3/g; + s/(<sec-title [^<>]+>)\s*(\S.*?\S|\S)\s*(<\/sec-title>)/$1$2$3/g; + s/ +/ /g; + print; +} + +exit 0; + diff --git a/v1.3.7/bin/tokenize-english.pl b/v1.3.7/bin/tokenize-english.pl new file mode 100755 index 0000000..beec3e0 --- /dev/null +++ b/v1.3.7/bin/tokenize-english.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl -w + +# Author: Ulf Hermjakob +# Written: May 15, 2017 - November 30, 2020 + +# $version = "v1.3.7"; + +$|=1; + +use FindBin; +use Cwd "abs_path"; +use File::Basename qw(dirname); +use File::Spec; + +my $bin_dir = abs_path(dirname($0)); +my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir()); +my $data_dir = File::Spec->catfile($root_dir, "data"); +my $lib_dir = File::Spec->catfile($root_dir, "lib"); + +use lib "$FindBin::Bin/../lib"; +use NLP::English; +use NLP::utilities; +use NLP::UTF8; +$englishPM = NLP::English; +$control = " "; +$english_abbreviation_filename = File::Spec->catfile($data_dir, "EnglishAbbreviations.txt"); +$bio_split_patterns_filename = File::Spec->catfile($data_dir, "BioSplitPatterns.txt"); +%ht = (); + +while (@ARGV) { + $arg = shift @ARGV; + if ($arg =~ /^-*bio/) { + $control .= "bio "; + } else { + print STDERR "Ignoring unrecognized arg $arg\n"; + } +} + +$englishPM->load_english_abbreviations($english_abbreviation_filename, *ht); +$englishPM->load_split_patterns($bio_split_patterns_filename, *ht); + +while (<>) { + ($pre, $s, $post) = ($_ =~ /^(\s*)(.*?)(\s*)$/); + my $s = $englishPM->tokenize($s, *ht, $control); + $s =~ s/^\s*//; + $s =~ s/\s*$//; + print "$pre$s$post"; +} + +exit 0; + diff --git a/v1.3.7/bin/xml-reader.pl b/v1.3.7/bin/xml-reader.pl new file mode 100755 index 0000000..43da2e3 --- /dev/null +++ b/v1.3.7/bin/xml-reader.pl @@ -0,0 +1,112 @@ +#!/usr/bin/perl -w + +# Author: Ulf Hermjakob +# First written: February 2, 2015 +# Version: 1.3 (May 16, 2017) + +# Usage: xml-reader.pl -i <xml-filename> [--pretty [--indent <n>]] [--html <html-filename>] [--docid <input-docid>] [--type {nxml|elsxml|ldcxml}] +# <xml-filename> is the input file in XML format +# --pretty is an option that will cause the output to be XML in "pretty" indented format. +# -- index <n> is a suboption to specify the number of space characters per indentation level +# --html <html-filename> specifies an optional output file in HTML that displays the output sentences +# in a format easily readable (and checkable) by humans +# --docid <input-docid> is an optional input; needed in particular if system can't find docid +# inside input XML file. +# --type {nxml|elsxml} specifies optional special (non-standard) input type (XML variant). +# Type will be automatically deduced for filenames ending in .nxml or .elsxml. +# Example: bin/xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml | bin/normalize-workset-sentences.pl | bin/add-prefix.pl a3_ > test/Cancel_Cell_pmid17418411.txt +# Example: xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml --pretty --indent 3 +# Example: xml-reader.pl -i test/Cancel_Cell_pmid17418411.nxml --html test/Cancel_Cell_pmid17418411.html --docid PMID:17418411 --type nxml + +$|=1; + +use FindBin; +use Cwd "abs_path"; +use File::Basename qw(dirname); +use File::Spec; + +my $bin_dir = abs_path(dirname($0)); +my $root_dir = File::Spec->catfile($bin_dir, File::Spec->updir()); +my $data_dir = File::Spec->catfile($root_dir, "data"); +my $lib_dir = File::Spec->catfile($root_dir, "lib"); + +use lib "$FindBin::Bin/../lib"; +use NLP::utilities; +use NLP::xml; + +$xml = NLP::xml; +%ht = (); +$pretty_print_p = 0; +$xml_in_filename = ""; +$html_out_filename = ""; +$xml_id = "XML1"; +$doc_id = ""; +$workset_name = ""; +$snt_id_core = ""; +$schema = ""; +$indent = 3; +$xml_type = ""; + +while (@ARGV) { + $arg = shift @ARGV; + if ($arg =~ /^-+(pretty|pp)$/) { + $pretty_print_p = 1; + } elsif ($arg =~ /^-+(i|xml)$/) { + $xml_in_filename = shift @ARGV; + $xml_type = "elsxml" if ($xml_type eq "") && ($xml_in_filename =~ /\.elsxml$/); + $xml_type = "nxml" if ($xml_type eq "") && ($xml_in_filename =~ /\.nxml$/); + } elsif ($arg =~ /^-+indent$/) { + $indent = shift @ARGV; + } elsif ($arg =~ /^-+doc[-_]?id$/) { + $doc_id = shift @ARGV; + } elsif ($arg =~ /^-+html$/) { + $html_out_filename = shift @ARGV; + } elsif ($arg =~ /^-+(xml[-_]?type|type)$/) { + $xml_type = shift @ARGV; + } else { + print STDERR "Ignoring unrecognized arg $arg\n"; + } +} + +if ($xml_type eq "elsxml") { + @snts = split(/\n/, $xml->extract_elsxml_paper_snts($xml_in_filename, *ht, $xml_id, $doc_id, $schema)); +} elsif ($xml_type eq "nxml") { + @snts = split(/\n/, $xml->extract_nxml_paper_snts($xml_in_filename, *ht, $xml_id, $doc_id, $schema)); +} elsif ($xml_type eq "ldcxml") { + @snts = split(/\n/, $xml->extract_ldc_snts($xml_in_filename, *ht, $xml_id, $doc_id, $schema)); +} else { + # The following read_xml_file is already included in above extract_...xml_paper_snts + $xml->read_xml_file($xml_in_filename, *ht, $xml_id, $schema); +} + +unless ($doc_id) { + $doc_id = $xml->find_doc_id(*ht, $xml_id, $xml_type, "pmid") + || $xml->find_doc_id(*ht, $xml_id, $xml_type, "pmc") + || $xml->find_doc_id(*ht, $xml_id, $xml_type); +} + +if ($pretty_print_p) { + print $xml->write_xml("1.1", *ht, $xml_id, $schema, $indent); +} else { + die "No doc_id available (neither as argument nor in specified in doc)" unless $doc_id; + $workset_name = lc $doc_id; + $workset_name =~ s/[_:]+/-/g; + $snt_id_core = $workset_name; + $snt_id_core =~ s/-+/_/g; + if ($snt_id_core =~ /\d\d\d\d\d$/) { + $snt_id_core =~ s/(\d\d\d\d)$/_$1/; + } elsif ($snt_id_core =~ /\d[-_.]\d\d\d\d$/) { + $snt_id_core =~ s/[-_.](\d\d\d\d)$/_$1/; + } else { + $snt_id_core .= "_0000"; + } + if ($html_out_filename) { + $n_snt = $xml->write_workset_to_html(*ht, $html_out_filename, $doc_id, $workset_name, $snt_id_core, $schema, @snts); + } else { + $n_snt = $xml->write_workset_as_plain_txt(*ht, *STDOUT, $snt_id_core, @snts); + } + print STDERR "Output $n_snt sentences\n"; +} + +exit 0; + diff --git a/v1.3.7/data/BioSplitPatterns.txt b/v1.3.7/data/BioSplitPatterns.txt new file mode 100644 index 0000000..14ab7dc --- /dev/null +++ b/v1.3.7/data/BioSplitPatterns.txt @@ -0,0 +1,390 @@ +# split cAMP-dependent, insulin-responsive +SPLIT-DASH-X acquired +SPLIT-DASH-X activated +SPLIT-DASH-X activating +SPLIT-DASH-X activation +SPLIT-DASH-X amplified +SPLIT-DASH-X associated +SPLIT-DASH-X based +SPLIT-DASH-X bearing +SPLIT-DASH-X binding +SPLIT-DASH-X bound +SPLIT-DASH-X catalyzed +SPLIT-DASH-X conditioned +SPLIT-DASH-X consuming +SPLIT-DASH-X containing +SPLIT-DASH-X coupled +SPLIT-DASH-X deficient +SPLIT-DASH-X dependent +SPLIT-DASH-X depleted +SPLIT-DASH-X deprived +SPLIT-DASH-X derived +SPLIT-DASH-X designated +SPLIT-DASH-X driven +SPLIT-DASH-X enhanced +SPLIT-DASH-X enhancement +SPLIT-DASH-X enhances +SPLIT-DASH-X enhancing +SPLIT-DASH-X expressed +SPLIT-DASH-X expresses +SPLIT-DASH-X expressing +SPLIT-DASH-X expression +SPLIT-DASH-X free +SPLIT-DASH-X high +SPLIT-DASH-X independent +SPLIT-DASH-X induced +SPLIT-DASH-X inducing +SPLIT-DASH-X inducible +SPLIT-DASH-X infected +SPLIT-DASH-X inhibiting +SPLIT-DASH-X inhibitor +SPLIT-DASH-X inhibitory +SPLIT-DASH-X injected +SPLIT-DASH-X interacting +SPLIT-DASH-X intermediately +SPLIT-DASH-X labeled +SPLIT-DASH-X labelled +SPLIT-DASH-X loving +SPLIT-DASH-X mediated +SPLIT-DASH-X mutant +SPLIT-DASH-X mutated +SPLIT-DASH-X negative +SPLIT-DASH-X phosphorylated +SPLIT-DASH-X phosphorylation +SPLIT-DASH-X positive +SPLIT-DASH-X promoting +SPLIT-DASH-X old +SPLIT-DASH-X regulated +SPLIT-DASH-X related +SPLIT-DASH-X resistance +SPLIT-DASH-X resistant +SPLIT-DASH-X responsive +SPLIT-DASH-X responsiveness +SPLIT-DASH-X rich +SPLIT-DASH-X selective +SPLIT-DASH-X sensitive +SPLIT-DASH-X shaped +SPLIT-DASH-X signaling +SPLIT-DASH-X specific +SPLIT-DASH-X stimulated +SPLIT-DASH-X stimulating +SPLIT-DASH-X sufficient +SPLIT-DASH-X tagged +SPLIT-DASH-X transfected +SPLIT-DASH-X transformed +SPLIT-DASH-X treated +SPLIT-DASH-X ubiquitinated +SPLIT-DASH-X ubiquitination + +# split quantities, e.g. a three-day experiment +SPLIT-DASH-X second +SPLIT-DASH-X minute +SPLIT-DASH-X min +SPLIT-DASH-X hour +SPLIT-DASH-X day +SPLIT-DASH-X week +SPLIT-DASH-X wk +SPLIT-DASH-X month +SPLIT-DASH-X year +SPLIT-DASH-X foot +SPLIT-DASH-X inch +SPLIT-DASH-X kilometer +SPLIT-DASH-X meter +SPLIT-DASH-X mile +SPLIT-DASH-X early +SPLIT-DASH-X mid +SPLIT-DASH-X late + +# three-fold, four-times +SPLIT-DASH-X fold +SPLIT-DASH-X times + +SPLIT-X-DASH ubiquitinated +SPLIT-X-DASH give +SPLIT-X-DASH mid +SPLIT-X-DASH post +SPLIT-X-DASH pre +SPLIT-X-DASH sub +SPLIT-X-DASH 's +SPLIT-X-DASH to +SPLIT-X-DASH foot +SPLIT-X-DASH inch +SPLIT-X-DASH kilometer +SPLIT-X-DASH meter +SPLIT-X-DASH mile + +DO-NOT-SPLIT-DASH-X alpha +DO-NOT-SPLIT-DASH-X alfa +DO-NOT-SPLIT-DASH-X beta +DO-NOT-SPLIT-DASH-X gamma +DO-NOT-SPLIT-DASH-X kappa + +DO-NOT-SPLIT-DASH-X acetate +DO-NOT-SPLIT-DASH-X acid +DO-NOT-SPLIT-DASH-X actin +DO-NOT-SPLIT-DASH-X actinin +DO-NOT-SPLIT-DASH-X antichymotrypsin +DO-NOT-SPLIT-DASH-X antitrypsin +DO-NOT-SPLIT-DASH-X arabinofuransylcytosine +DO-NOT-SPLIT-DASH-X arrestin +DO-NOT-SPLIT-DASH-X benzyloxycarbonyl +DO-NOT-SPLIT-DASH-X cadherin +DO-NOT-SPLIT-DASH-X catenin +DO-NOT-SPLIT-DASH-X chlorophenyl +DO-NOT-SPLIT-DASH-X cyclin +DO-NOT-SPLIT-DASH-X deoxyuridine +DO-NOT-SPLIT-DASH-X enolase +DO-NOT-SPLIT-DASH-X erbbeta +DO-NOT-SPLIT-DASH-X estradiol +DO-NOT-SPLIT-DASH-X ethynyl +DO-NOT-SPLIT-DASH-X fluoromethylketone +DO-NOT-SPLIT-DASH-X galactosidase +DO-NOT-SPLIT-DASH-X integrin +DO-NOT-SPLIT-DASH-X isomerase +DO-NOT-SPLIT-DASH-X kinase +DO-NOT-SPLIT-DASH-X kinases +DO-NOT-SPLIT-DASH-X macroglobulin +DO-NOT-SPLIT-DASH-X myb +DO-NOT-SPLIT-DASH-X myc +DO-NOT-SPLIT-DASH-X myosin +DO-NOT-SPLIT-DASH-X orthophosphate +DO-NOT-SPLIT-DASH-X phosphatase +DO-NOT-SPLIT-DASH-X proteasome +DO-NOT-SPLIT-DASH-X sialyltransferase +DO-NOT-SPLIT-DASH-X sorafenib +DO-NOT-SPLIT-DASH-X sulfanylbutanamide +DO-NOT-SPLIT-DASH-X tegrin +DO-NOT-SPLIT-DASH-X trypsin +DO-NOT-SPLIT-DASH-X tubulin + +DO-NOT-SPLIT-X-DASH Caco +DO-NOT-SPLIT-X-DASH NCI +DO-NOT-SPLIT-X-DASH NF +DO-NOT-SPLIT-X-DASH TM + +DO-NOT-SPLIT B-cell +DO-NOT-SPLIT C-terminal +DO-NOT-SPLIT C-termini +DO-NOT-SPLIT C-terminus +DO-NOT-SPLIT CA-RAS +DO-NOT-SPLIT COOH-terminal +DO-NOT-SPLIT COOH-terminally +DO-NOT-SPLIT COOH-termini +DO-NOT-SPLIT COOH-terminus +DO-NOT-SPLIT co-activate +DO-NOT-SPLIT co-activated +DO-NOT-SPLIT co-activates +DO-NOT-SPLIT co-activation +DO-NOT-SPLIT co-activator +DO-NOT-SPLIT co-administer +DO-NOT-SPLIT co-administered +DO-NOT-SPLIT co-administers +DO-NOT-SPLIT co-administration +DO-NOT-SPLIT co-culture +DO-NOT-SPLIT co-cultured +DO-NOT-SPLIT co-exist +DO-NOT-SPLIT co-existed +DO-NOT-SPLIT co-exposed +DO-NOT-SPLIT co-exposure +DO-NOT-SPLIT co-express +DO-NOT-SPLIT co-expresses +DO-NOT-SPLIT co-expressing +DO-NOT-SPLIT co-expression +DO-NOT-SPLIT co-immunoprecipitate +DO-NOT-SPLIT co-immunoprecipitated +DO-NOT-SPLIT co-immunoprecipitates +DO-NOT-SPLIT co-immunoprecipitation +DO-NOT-SPLIT co-inhibit +DO-NOT-SPLIT co-inhibited +DO-NOT-SPLIT co-inhibits +DO-NOT-SPLIT co-inhibition +DO-NOT-SPLIT co-occur +DO-NOT-SPLIT co-occurring +DO-NOT-SPLIT co-operation +DO-NOT-SPLIT co-operate +DO-NOT-SPLIT co-operates +DO-NOT-SPLIT co-repress +DO-NOT-SPLIT co-repression +DO-NOT-SPLIT co-repressor +DO-NOT-SPLIT co-stimulation +DO-NOT-SPLIT co-transfection +DO-NOT-SPLIT co-treatment +DO-NOT-SPLIT de-phosphorylate +DO-NOT-SPLIT de-phosphorylated +DO-NOT-SPLIT de-phosphorylates +DO-NOT-SPLIT de-phosphorylation +DO-NOT-SPLIT double-strand +DO-NOT-SPLIT double-stranded +DO-NOT-SPLIT down-modulate +DO-NOT-SPLIT down-modulated +DO-NOT-SPLIT down-modulates +DO-NOT-SPLIT down-modulation +DO-NOT-SPLIT down-regulation +DO-NOT-SPLIT down-regulate +DO-NOT-SPLIT down-regulated +DO-NOT-SPLIT down-regulates +DO-NOT-SPLIT E-box +DO-NOT-SPLIT F-box +DO-NOT-SPLIT first-line +DO-NOT-SPLIT Ha-RAS +DO-NOT-SPLIT half-life +DO-NOT-SPLIT HD-LM2 +DO-NOT-SPLIT hetero-dimer +DO-NOT-SPLIT hetero-dimerization +DO-NOT-SPLIT hetero-dimers +DO-NOT-SPLIT in-vitro +DO-NOT-SPLIT long-term +DO-NOT-SPLIT Ki-RAS +DO-NOT-SPLIT knock-down +DO-NOT-SPLIT N-terminal +DO-NOT-SPLIT N-termini +DO-NOT-SPLIT N-terminus +DO-NOT-SPLIT NH2-terminal +DO-NOT-SPLIT NH2-terminally +DO-NOT-SPLIT NH2-termini +DO-NOT-SPLIT NH2-terminus +DO-NOT-SPLIT over-express +DO-NOT-SPLIT over-expressed +DO-NOT-SPLIT over-expressing +DO-NOT-SPLIT over-expression +DO-NOT-SPLIT pA-Raf +DO-NOT-SPLIT pB-Raf +DO-NOT-SPLIT pC-Raf +DO-NOT-SPLIT pull-down +DO-NOT-SPLIT real-time +DO-NOT-SPLIT RT-PCR +DO-NOT-SPLIT single-strand +DO-NOT-SPLIT single-stranded +DO-NOT-SPLIT short-term +DO-NOT-SPLIT T-bet +DO-NOT-SPLIT T-cell +DO-NOT-SPLIT T-lymphocyte +DO-NOT-SPLIT up-regulate +DO-NOT-SPLIT up-regulated +DO-NOT-SPLIT up-regulates +DO-NOT-SPLIT up-regulation +DO-NOT-SPLIT wild-type +DO-NOT-SPLIT X-box +DO-NOT-SPLIT Z-disc + +DO-NOT-SPLIT C/EBP +DO-NOT-SPLIT C/EBPdelta +DO-NOT-SPLIT E/T +DO-NOT-SPLIT P/P +DO-NOT-SPLIT S/T-P +DO-NOT-SPLIT S/TP +DO-NOT-SPLIT T/C + +DO-NOT-SPLIT 8-Br-cAMP +DO-NOT-SPLIT 8-Cl-cAMP +DO-NOT-SPLIT 14-3-3 +DO-NOT-SPLIT 1-on-1 + +SPLIT anti-apoptotic +SPLIT anti-inflammatory +SPLIT anti-proliferative +SPLIT anti-tumor +SPLIT anti-tumour +SPLIT cell-cell +SPLIT fast-growing +SPLIT full-length +SPLIT growth-inhibition +SPLIT high-frequency +SPLIT high-grade +SPLIT high-level +SPLIT late-onset +SPLIT low-density +SPLIT low-frequency +SPLIT low-grade +SPLIT lung-tumor +SPLIT near-complete +SPLIT non-autopsy +SPLIT non-canonical +SPLIT non-codon +SPLIT non-grey +SPLIT non-neoplastic +SPLIT non-polarized +SPLIT non-radioactive +SPLIT non-responding +SPLIT non-small +SPLIT non-tumor +SPLIT non-tumour +SPLIT one-half +SPLIT one-third +SPLIT post-coitum +SPLIT post-confluency +SPLIT post-confluent +SPLIT post-hypoxic +SPLIT post-injection +SPLIT post-injectional +SPLIT post-stimulation +SPLIT post-transcriptional +SPLIT post-transcription +SPLIT post-transfection +SPLIT post-transfectional +SPLIT post-translation +SPLIT post-translational +SPLIT pro-apoptotic +SPLIT pro-inflammatory +SPLIT sex-determination +SPLIT small-cell +SPLIT three-dimensional +SPLIT tumor-educated +SPLIT tumour-educated +SPLIT two-dimensional +SPLIT ultra-low + +SPLIT a-second +SPLIT a-minute +SPLIT an-hour +SPLIT a-day +SPLIT a-week +SPLIT a-month +SPLIT a-year +SPLIT a-dozen +SPLIT a-million +SPLIT a-billion +SPLIT a-trillion +SPLIT a-third +SPLIT a-quarter + +SPLIT twenty-one +SPLIT twenty-two +SPLIT twenty-three +SPLIT twenty-four +SPLIT twenty-five +SPLIT twenty-six +SPLIT twenty-seven +SPLIT twenty-eight +SPLIT twenty-nine + +SPLIT thirty-one +SPLIT thirty-two +SPLIT thirty-three +SPLIT thirty-four +SPLIT thirty-five +SPLIT thirty-six +SPLIT thirty-seven +SPLIT thirty-eight +SPLIT thirty-nine + +SPLIT forty-one +SPLIT forty-two +SPLIT forty-three +SPLIT forty-four +SPLIT forty-five +SPLIT forty-six +SPLIT forty-seven +SPLIT forty-eight +SPLIT forty-nine + +SPLIT fifty-one +SPLIT fifty-two +SPLIT fifty-three +SPLIT fifty-four +SPLIT fifty-five +SPLIT fifty-six +SPLIT fifty-seven +SPLIT fifty-eight +SPLIT fifty-nine + diff --git a/v1.3.7/data/EnglishAbbreviations.txt b/v1.3.7/data/EnglishAbbreviations.txt new file mode 100755 index 0000000..180b0b8 --- /dev/null +++ b/v1.3.7/data/EnglishAbbreviations.txt @@ -0,0 +1,388 @@ +Jan. :: January +Feb. :: February +Febr. :: February +Mar. :: March +Apr. :: April +Jun. :: June +Jul. :: July +Aug. :: August +Sep. :: September +Sept. :: September +Oct. :: October +Nov. :: November +Dec. :: December + +Amb. :: Ambassador +Br. :: Brother +Dr. :: Doctor +Eng. :: Engineer +Fr. :: Father +Gov. :: Governor +Hon. :: Honorable +Ing. :: Ingenieur :: Ingeniero +Ir. :: Ingenieur +Jr. :: Junior +Messrs. :: Messieurs +Mr. :: Mister +Mrs. :: Misses +Ms. +Pres. :: President +Prof. :: Professor +Rep. :: Representative +Rev. :: Reverend +Sen. :: Senator +Sr. :: Senior +St. :: Saint +Spt. :: Superintendent +Sup. :: Superintendent +Supt. :: Superintendent + +Adm. :: Admiral +Brig. :: Brigadier +Brig.Gen. :: Brigadier General +Brig.-Gen.:: Brigadier General +Capt. :: Captain +Cpt. :: Captain +Col. :: Colonel +Gen. :: General +Lt. :: Lieutenant +Lt.Gen. :: Lieutenant General +Lt.-Gen. :: Lieutenant General +Maj. :: Major +Maj.Gen. :: Major General +Maj.-Gen. :: Major General +Sgt. :: Sergeant + +Ret. :: Retired + +Co. :: Company +Corp. :: Corporation +Inc. :: Incorporated +LLC :: Limited Liability Company +Ltd. :: Limited +N.V. :: Naamloze vennootschap +S.p.A. :: Società per Azioni + +A.D. :: anno Domini +a.m. :: ante meridiem +al. :: alii +B.C. :: before Christ +B.S. :: bullshit +etc. :: et cetera :: and other things +cf. :: confer :: compare +ed. :: editor +e.g. :: exempli gratia :: for example +et al. :: et alii :: et aliae :: and others +i.e. :: id est :: that is +Mt. :: Mount +p.m. :: post meridiem +p.s. :: post scriptum +S.E. :: standard error +U.K. :: United Kingdom +U.N. :: United Nations +vs. :: versus +yrs. :: years + +B.P. :: blood pressure +s.c. :: subcutaneous +s.d. :: stable disease + +art. :: article +arts. :: articles +chap. :: chapter +chaps. :: chapters +fig. :: figure +Fig. :: figure +op. :: opus +p. :: page +para. :: paragraph +paras. :: paragraphs +pp. :: pages +# No. :: number +Nos. :: numbers +sect. :: section +sects. :: sections +vol. :: volume +vols. :: volumes + +Ala. :: Alabama +Ariz. :: Arizona +Ark. :: Arkansas +Calif. :: California +Colo. :: Colorado +Conn. :: Connecticut +D.C. :: District of Columbia +Del. :: Delaware +Fla. :: Florida +Ga. :: Georgia +Ill. :: Illinois +Ind. :: Indiana +Kans. :: Kansas +Ky. :: Kentucky +La. :: Louisiana +Mass. :: Massachusetts +Md. :: Maryland +Mich. :: Michigan +Minn. :: Minnesota +Miss. :: Mississippi +Mo. :: Missouri +Mont. :: Montana +Nebr. :: Nebraska +Nev. :: Nevada +N.C. :: North Carolina +N.D. :: North Dakota +N.H. :: New Hampshire +N.J. :: New Jersey +N.M. :: New Mexico +N.Y. :: New York +Okla. :: Oklahoma +Ore. :: Oregon +Pa. :: Pennsylvania +P.R. :: Puerto Rico +R.I. :: Rhode Island +S.C. :: South Carolina +S.D. :: South Dakota +Tenn. :: Tennessee +Tex. :: Texas +U.S. :: United States +U.S.A. :: United States of America +Va. :: Virginia +V.I. :: Virgin Islands +Vt. :: Vermont +Wash. :: Washington +W.Va. :: West Virginia +Wis. :: Wisconsin +Wyo. :: Wyoming + +CA :: California +GA :: Georgia +NY :: New York +TX :: Texas + +ISI :: Information Sciences Institute +MIT :: Massachusetts Institute of Technology +UCLA :: University of California, Los Angeles +USC :: University of Southern California +USPS :: United States Postal Service + +AF :: Afghanistan +AL :: Albania +AT :: Austria +AU :: Australia +BA :: Bosnia and Herzegovina +BD :: Bangladesh +BE :: Belgium +BG :: Bulgaria +BR :: Brazil +CA :: Canada +CG :: Republic of Congo +CH :: Switzerland +CN :: China +CR :: Costa Rica +CU :: Cuba +CV :: Cape Verde +DE :: Germany +DK :: Denmark +DZ :: Algeria +EE :: Estonia +EG :: Egypt +ES :: Spain +FR :: France +GB :: Great Britain +GB :: United Kingdom +GE :: Georgia +GH :: Ghana +HK :: Hong Kong +HN :: Honduras +HR :: Croatia +ID :: Indonesia +IN :: India +IQ :: Iraq +IR :: Iran +IR :: Islamic Republic of Iran +IL :: Israel +IT :: Italy +JO :: Jordan +JP :: Japan +KE :: Kenya +KG :: Kyrgyzstan +KH :: Cambodia +KP :: Democratic People's Republic of Korea +KP :: North Korea +KR :: Republic of Korea +KR :: South Korea +KZ :: Kazakhstan +LA :: Laos +LA :: Lao People's Democratic Republic +LY :: Libya +LY :: Libyan Arab Jamahiriya +MA :: Morocco +MC :: Monaco +MK :: Macedonia +MK :: Former Yugoslav Republic of Macedonia +MM :: Myanmar +MR :: Mauritania +MX :: Mexico +MY :: Malaysia +NI :: Nicaragua +NL :: Netherlands +PA :: Panama +PE :: Peru +PH :: Philippines +PK :: Pakistan +PS :: Palestine +PS :: Palestinian Territory +RO :: Romania +RU :: Russia +RU :: Russian Federation +QA :: Qatar +SA :: Saudi Arabia +SE :: Sweden +SG :: Singapore +SK :: Slovakia +SY :: Syria +SY :: Syrian Arab Republic +TH :: Thailand +TJ :: Tajikistan +TM :: Turkmenistan +TN :: Tunisia +TW :: Taiwan +TZ :: Tanzania +UA :: Ukraine +UAE :: United Arab Emirates +UK :: Britain +UK :: Great Britain +UK :: United Kingdom +US :: United States +US :: United States of America +USA :: United States +USA :: United States of America +UZ :: Uzbekistan +VN :: Vietnam +VN :: Viet Nam +YE :: Yemen +ZA :: South Africa +ZW :: Zimbabwe + +A&E :: accident and emergency department +AAM :: air-to-air missile +ABC :: American Broadcasting Corporation +ABC :: Australian Broadcasting Corporation +AC :: air conditioning +AC :: alternating current +ADL :: Anti-Defamation League +ASEAN :: Association of Southeast Asian Nations +ATM :: automated teller machine +BNP :: British National Party +CEO :: chief executive officer +CCTV :: closed-circuit television +CCTV :: China Central Television +CEO :: chief executive officer +CFE :: Conventional Forces in Europe +CFO :: chief financial officer +CPU :: central processing unit +CS :: computer science +DC :: District of Columbia +DD :: dear daughter +DDoS :: distributed denial of service +DH :: dear husband +DNA :: deoxyribonucleic acid +DS :: dear son +DW :: dear wife +EMT :: epithelial-mesenchymal transition +ESA :: European Space Agency +EU :: European Union +GCC :: Golf Cooperation Council +GOP :: Republic Party # US: Grand Old Party +IAEA :: International Atomic Energy Agency +ICBM :: intercontinental ballistic missile +IED :: improvised explosive device +IP :: intellectual property +IP :: internet protocol +ISS :: International Space Station +IT :: internet protocol +IT :: intellectual property +JI :: Jemaah Islamiyah +LA :: Los Angeles +MM :: multiple myeloma +MP :: member of parliament # UK +MP :: military police +NAFTA :: North American Free Trade Agreement +NCRI :: National Council of Resistance of Iran +NATO :: North Atlantic Treaty Organization +NGO :: non-governmental organization +NPT :: Non-Proliferation Treaty +NYT :: New York Times +OP :: original poster +OSCE :: Organization for Security and Cooperation in Europe +OWS :: Occupy Wall Street +PC :: personal computer +PC :: politically correct +PKK :: Kurdistan Workers' Party +POW :: prisoner of war +RMB :: renminbi +RNA :: ribonucleic acid +RPG :: rocket-propelled grenade +TNT :: trinitrotoluene +UAE :: United Arab Emirates +UK :: United Kingdom +UN :: United Nations +UNDCP :: United Nations Drug Control Program +UNODC :: United Nations Office for Drugs and Crime +US :: United States +USA :: United States of America +WHO :: World Health Organization +WMD :: weapons of mass destruction +WSJ :: Wall Street Journal +WT :: wild-type +WTO :: World Trade Organization +WWI :: World War I +WWII :: World War II + +cm :: centimeter +g :: gram +h :: hour +hrs :: hour +kg :: kilogram +km :: kilometer +M :: molar +mg :: milligram +min :: minute +ml :: milliliter +mm :: millimeter +mph :: miles per hour +µm :: micrometer +µM :: micromolar +ng :: nanogram +nM :: nanomolar + +ad :: advertisement +asap :: as soon as possible +bd :: bluray disc +bro :: brother +cd :: compact disc +dna :: deoxyribonucleic acid +dvd :: digital versatile disc +exec :: executive +execs :: executives +flu :: influenza +lib :: liberal +lol :: laughing out loud +mag :: magazine +math :: mathematics +maths :: mathematics +pic :: picture +pvc :: polyvinyl chloride +rna :: ribonucleic acid +Ser :: serine +sis :: sister +tg :: transgenic +Tg :: transgenic +TV :: television +USMC :: United States Marine Corps +wt :: wild-type + +d. :: day +g. :: gram +m. :: meter + diff --git a/v1.3.7/lib/NLP/English.pm b/v1.3.7/lib/NLP/English.pm new file mode 100755 index 0000000..0873c02 --- /dev/null +++ b/v1.3.7/lib/NLP/English.pm @@ -0,0 +1,3114 @@ +################################################################ +# # +# English # +# # +################################################################ + +package NLP::English; + +use File::Basename; +use File::Spec; + +# tok v1.3.7 (November 30, 2020) + +$chinesePM = NLP::Chinese; +$ParseEntry = NLP::ParseEntry; +$util = NLP::utilities; +$utf8 = NLP::UTF8; +$logfile = ""; +# $logfile2 = (-d "/nfs/isd/ulf/smt/agile") ? "/nfs/isd/ulf/smt/agile/minilog" : ""; +# $util->init_log($logfile2); + +$currency_symbol_list = "\$|\xC2\xA5|\xE2\x82\xAC|\xE2\x82\xA4"; +$english_resources_skeleton_dir = ""; +%dummy_ht = (); + +sub build_language_hashtables { + local($caller, $primary_entity_style_filename, $data_dir) = @_; + + unless ($data_dir) { + $default_data_dir = "/nfs/nlg/users/textmap/brahms-ml/arabic/bin/modules/NLP"; + $data_dir = $default_data_dir if -d $default_data_dir; + } + my $english_word_filename = "$data_dir/EnglishWordlist.txt"; + my $default_entity_style_MT_filename = "$data_dir/EntityStyleMT-zh.txt"; + my $entity_style_all_filename = "$data_dir/EntityStyleAll.txt"; + my $EnglishNonNameCapWords_filename = "$data_dir/EnglishNonNameCapWords.txt"; + $english_resources_skeleton_dir = "$data_dir/EnglishResources/skeleton"; + %english_annotation_ht = (); + %annotation_english_ht = (); + %english_ht = (); + $CardinalMaxWithoutComma = 99999; + $CardinalMaxNonLex = 9999000; + + $primary_entity_style_filename = $default_entity_style_MT_filename unless defined($primary_entity_style_filename); + if ($primary_entity_style_filename =~ /^(ar|zh)$/) { + $languageCode = $primary_entity_style_filename; + $primary_entity_style_filename + = File::Spec->catfile($data_dir, "EntityStyleMT-$languageCode.txt"); + } + + open(IN,$english_word_filename) || die "Can't open $english_word_filename"; + while (<IN>) { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + $line = $_; + @lines = ($line); + if (($line =~ /::gpe:/) + && (($annotation) = ($line =~ /^.*?::(.*)$/)) + && (($pre_annotation, $singular_english, $post_annotation) = ($annotation =~ /^(.*)::plural-of:([^:]+)(|::.*)\s*$/))) { + $derived_annotation = $singular_english . "::$pre_annotation$post_annotation"; + # print STDERR "derived_annotation: $derived_annotation\n"; + push(@lines, $derived_annotation); + } + foreach $line (@lines) { + ($english,@slots) = split("::",$line); + next unless defined($english); + $english =~ s/\s+$//; + $lc_english = $english; + $lc_english =~ tr/[A-Z]/[a-z]/; + $annotation = "::" . join("::",@slots) . "::"; + $english_annotation_ht{$english} = $annotation; + $english_annotation_ht{$lc_english} = $annotation; + $english_annotation_ht{"_ALT_"}->{$english}->{$annotation} = 1; + $english_annotation_ht{"_ALT_"}->{$lc_english}->{$annotation} = 1; + $synt = ""; + foreach $slot_value (@slots) { + ($slot,$value) = ($slot_value =~ /\s*(\w[^:]+):\s*(\S.*)$/); + next unless defined($value); + $slot =~ s/\s+$//; + $value =~ s/\s+$//; + $synt = $value if $slot eq "synt"; + if (defined($annotation_english_ht{$slot_value})) { + push(@{$annotation_english_ht{$slot_value}},$english); + } else { + my @elist = ($english); + $annotation_english_ht{$slot_value} = \@elist; + } + if ($synt && defined($slot_value) && ($slot ne "synt")) { + $annot = "synt:$synt" . "::$slot_value"; + if (defined($annotation_english_ht{$annot})) { + push(@{$annotation_english_ht{$annot}},$english); + } else { + my @elist = ($english); + $annotation_english_ht{$annot} = \@elist; + } + $english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot} = $value; + } + } + } + } + close(IN); + + if (open(IN,$EnglishNonNameCapWords_filename)) { + while (<IN>) { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + $english_ht{(lc $_)}->{COMMON_NON_NAME_CAP} = 1; + } + close(IN); + } else { + print STDERR "Can't open $EnglishNonNameCapWords_filename\n"; + } + + foreach $style ("primary", "all") { + if ($style eq "primary") { + $entity_style_filename = $primary_entity_style_filename || $default_entity_style_MT_filename; + } elsif ($style eq "all") { + $entity_style_filename = $entity_style_all_filename; + } else { + next; + } + %ht = (); + open(IN,$entity_style_filename) || die("Can't open $entity_style_filename (stylefile)"); + my $n_entries = 0; + while (<IN>) { + next unless $_ =~ /^s*[^#\s]/; # unless blank/comment line + $_ =~ s/\s+$//; + ($slot,$value_string) = ($_ =~ /^([^:]+):\s*(\S.*)$/); + next unless defined($value_string); + if (defined($ht{$slot})) { + print STDERR "Warning: ignoring duplicate entry for $slot in $entity_style_filename\n"; + next; + } + @values = split("::", $value_string); + foreach $value (@values) { + $value =~ s/^\s+//g; + $value =~ s/\s+$//g; + } + my @values_copy = @values; + $ht{$slot} = \@values_copy; + $n_entries++; + } + # print STDERR "Processed $n_entries entries in $entity_style_filename\n"; + close(IN); + if ($style eq "primary") { + %english_entity_style_ht = %ht; + } elsif ($style eq "all") { + %english_entity_style_all_ht = %ht; + } + } + + if (defined($raw = $english_entity_style_ht{CardinalMaxWithoutComma}) + && (@styles = @{$raw}) && ($n = $styles[0]) && ($n =~ /^\d+$/) && ($n >= 999)) { + $CardinalMaxWithoutComma = $n; + } + if (defined($raw = $english_entity_style_ht{CardinalMaxNonLex}) + && (@styles = @{$raw}) && ($n = $styles[0]) && ($n =~ /^\d+$/) && ($n >= 999999)) { + $CardinalMaxNonLex = $n; + } + + return (*english_annotation_ht,*annotation_english_ht,*english_entity_style_ht); +} + +sub read_language_variations { + local($this, $filename, *ht) = @_; + + my $n = 0; + my $line_number = 0; + if (open(IN, $filename)) { + while (<IN>) { + $line_number++; + $us = $util->slot_value_in_double_colon_del_list($_, "us"); + $uk = $util->slot_value_in_double_colon_del_list($_, "uk"); + $formal = $util->slot_value_in_double_colon_del_list($_, "formal"); + $informal = $util->slot_value_in_double_colon_del_list($_, "informal"); + if ($us && $uk) { + $ht{VARIATION_UK_US}->{$uk}->{$us} = 1; + $n++; + } + if ($informal && $formal) { + $ht{VARIATION_INFORMAL_FORMAL}->{$informal}->{$formal} = 1; + $n++; + } + } + close(IN); + # print STDERR "Read $n spelling variation entries from $filename\n"; + } +} + +sub entity_style_listing { + local($caller,$attr) = @_; + + if (defined($l = $english_entity_style_ht{$attr})) { + @sl = @{$l}; + if (($#sl == 0) && ($sl[0] eq "all")) { + if (defined($al = $english_entity_style_all_ht{$attr})) { + return @{$al}; + } else { + return (); + } + } else { + return @sl; + } + } else { + return (); + } +} + +sub is_abbreviation { + local($caller,$noun) = @_; + + $result = defined($annotation_s = $english_annotation_ht{$noun}) + && ($annotation_s =~ /::abbreviation:true::/); +# print "is_abbreviation($noun): $result\n"; + return $result; +} + +sub noun_adv_sem { + local($caller,$noun) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$noun}); + ($adv_sem) = ($annotation_s =~ /::adv_sem:([-_a-z]+)::/); + return "" unless defined($adv_sem); + return $adv_sem; +} + +sub numeral_value { + local($caller,$numeral) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$numeral}); + ($value) = ($annotation_s =~ /::value:(\d+)::/); + return "" unless defined($value); + return $value; +} + +sub annot_slot_value { + local($caller,$lex, $slot) = @_; + + return "" unless defined($annotation_s = $english_annotation_ht{$lex}); + ($value) = ($annotation_s =~ /::$slot:([-_a-z]+)(?:::.*|)\s*$/i); + return "" unless defined($value); + return $value; +} + +sub annot_slot_values { + local($caller,$lex, $slot) = @_; + + return () unless @annotations = keys %{$english_annotation_ht{"_ALT_"}->{$lex}}; + @annot_slot_values = (); + foreach $annotation_s (@annotations) { + ($value) = ($annotation_s =~ /::$slot:([^:]+)(?:::.*|)\s*$/i); + if (defined($value)) { + $value =~ s/\s*$//; + push(@annot_slot_values, $value); + } + } + return @annot_slot_values; +} + +# quick and dirty +sub noun_number_form { + local($caller,$noun,$number) = @_; + + $noun = "rupee" if $noun =~ /^Rs\.?$/; + $noun = "kilometer" if $noun =~ /^km$/; + $noun = "kilogram" if $noun =~ /^kg$/; + $noun = "meter" if $noun =~ /^m$/; + $noun = "second" if $noun =~ /^(s|secs?\.?)$/; + $noun = "minute" if $noun =~ /^(mins?\.?)$/; + $noun = "hour" if $noun =~ /^(h|hrs?\.?)$/; + $noun = "year" if $noun =~ /^(yrs?\.?)$/; + $noun = "degree" if $noun =~ /^(deg\.?)$/; + $noun = "foot" if $noun =~ /^(feet|ft\.?)$/; + $noun = "square kilometer" if $noun =~ /^sq\.? km/; + $noun =~ s/metre$/meter/; + $noun =~ s/litre$/liter/; + $noun =~ s/gramme$/gram/; + $noun =~ s/tonne$/ton/; + return $noun if $noun =~ /\$$/; + return $noun unless $number =~ /^[0-9.]+$/; + return $noun if $util->member($noun,"percent"); # no change in plural + return $noun if $noun =~ /\b(yuan|renminbi|RMB|rand|won|yen|ringgit|birr)$/; # no change in plural + return $noun if $number <= 1; + + return $noun if $caller->is_abbreviation($noun); + + $noun =~ s/^(hundred|thousand|million|billion|trillion)\s+//; + return $noun if $noun =~ /^(dollar|kilometer|pound|ton|year)s$/i; + + $original_noun = $noun; + #check for irregular plural + $annot = "synt:noun::plural-of:$noun"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + return $elist[0] if @elist; + } + + $noun = $noun . "s"; + return $noun if $noun =~ /(a|e|o|u)ys$/; # days, keys, toys, guys + $noun =~ s/ys$/ies/; # babies + $noun =~ s/ss$/ses/; # buses + $noun =~ s/xs$/xes/; # taxes + $noun =~ s/shs$/shes/; # dishes + $noun =~ s/chs$/ches/; # churches + $noun =~ s/mans$/men/; # women + # print STDERR "NNF: $original_noun($number): $noun\n"; + return $noun; +} + +# quick and dirty +sub lex_candidates { + local($caller,$surf) = @_; + + @lex_cands = ($surf); + $lex_cand = $surf; + $lex_cand =~ s/ies$/y/; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/s$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/es$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/\.$//; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + $lex_cand = $surf; + $lex_cand =~ s/men$/man/; + push(@lex_cands,$lex_cand) unless $util->member($lex_cand, @lex_cands); + + return @lex_cands; +} + +# quick and dirty +sub pos_tag { + local($caller,$surf) = @_; + + return CD if ($surf =~ /^-?[0-9,\.]+$/); + return NN if ($surf =~ /^($currency_symbol_list\d)/); + @lex_candidates = $caller->lex_candidates($surf); +# print " lex_candidates: @lex_candidates\n"; + foreach $lex_cand (@lex_candidates) { + if (defined($annotation_s = $english_annotation_ht{$lex_cand})) { +# print " annotation: $annotation_s\n"; + ($synt) = ($annotation_s =~ /::synt:([^:]+)::/); + if (defined($synt)) { + if ($synt eq "art") { + return "DT"; + } elsif ($synt eq "adj") { + ($grade) = ($annotation_s =~ /::grade:([^:]+)::/); + if (defined($grade) && ($grade eq "superlative")) { + return "JJS"; + } elsif (defined($grade) && ($grade eq "comparative")) { + return "JJR"; + } else { + return "JJ"; + } + } elsif ($synt eq "noun") { + if ($lex_cand eq $surf) { + return "NN"; + } else { + return "NNS"; + } + } elsif ($synt eq "name") { + return "NNP"; + } elsif ($synt eq "cardinal") { + return "CD"; + } elsif ($synt eq "ordinal") { + return "JJ"; + } elsif ($synt eq "prep") { + return "IN"; + } elsif ($synt eq "conj") { + return "CC"; + } elsif ($synt eq "wh_pron") { + return "WP"; + } elsif ($synt eq "adv") { + return "RB"; + } elsif ($synt eq "genetive_particle") { + return "POS"; + } elsif ($synt eq "ordinal_particle") { + return "NN"; + } elsif ($synt eq "suffix_particle") { + return "NN"; + } elsif ($synt =~ /^int(erjection)?$/) { + return "UH"; + } elsif (($synt =~ /^punctuation$/) + && $util->is_rare_punctuation_string_p($surf)) { + return "SYM"; + } elsif ($synt =~ /\bverb$/) { + if ($surf =~ /^(is)$/) { + return "VBZ"; + } else { + return "VB"; + } + } + } + } + } + return ""; +} + +sub indef_art_filter { + local($caller,$surf) = @_; + + # check article in lexical annotation + # e.g. hour::synt:noun::unit:temporal::indef-article:an + # uniform::synt:noun::indef-article:a + ($surf_article,$word) = ($surf =~ /^(an?) (\S+)\s*/); + if (defined($surf_article) + && defined($word) + && defined($annotation = $english_annotation_ht{$word})) { + ($ann_article) = ($annotation =~ /::indef-article:([^:]+)::/); + if (defined($ann_article)) { + return ($surf_article eq $ann_article) ? $surf : ""; + } + } + return "" if $surf =~ /\ban [bcdfghjklmnpqrstvwxyz]/; + return "" if $surf =~ /\ban (US)\b/; + return "" if $surf =~ /\ba [aeio]/; + return "" if $surf =~ /\ba (under)/; + return $surf; +} + +sub wordlist_synt { + local($caller,$word) = @_; + + return "" unless defined($annotation = $english_annotation_ht{$word}); + ($synt) = ($annotation =~ /::synt:([^:]+)::/); + return $synt || ""; +} + +sub qualifier_filter { + local($caller,$surf) = @_; + + return "" if $surf =~ /\b(over|more than|approximately) (million|billion|trillion)/; + return "" if $surf =~ /\b(over) (once|twice)/; + return $surf; +} + +sub quantity_filter { + local($caller,$surf) = @_; + + return "" if $surf =~ /^(a|an)-/; # avoid "the a-week meeting" + return $surf; +} + +sub value_to_english { + local($caller,$number) = @_; + + $result = ""; + + $annot = "value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $result = $elist[0] if @elist; + } +# print "value_to_english($number)=$result\n"; + return $result; +} + +sub value_to_english_ordinal { + local($caller,$number) = @_; + + $result = ""; + + $annot = "synt:ordinal::value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $result = $elist[0] if @elist; + } else { + $annot = "value:$number"; + if (defined($annotation_english_ht{$annot})) { + @elist = @{$annotation_english_ht{$annot}}; + $cardinal = $elist[0] if @elist; + $result = $cardinal . "th"; + $result =~ s/yth$/ieth/; + } + } +# print "value_to_english($number)=$result\n"; + return $result; +} + +sub english_with_synt_slot_value { + local($caller, $english, $synt, $slot) = @_; + + return $english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot}; +} + +sub english_with_synt_slot_value_defined { + local($caller, $synt, $slot) = @_; + + @englishes_with_synt_slot_value_defined = (); + foreach $english (keys %{$english_annotation_ht{"_EN_SYNT_"}}) { + push(@englishes_with_synt_slot_value_defined, $english) + if defined($english_annotation_ht{"_EN_SYNT_"}->{$english}->{$synt}->{$slot}) + && ! $util->member($english, @englishes_with_synt_slot_value_defined) + } + return @englishes_with_synt_slot_value_defined; +} + +sub number_composed_surface_form { + local($caller,$number,$leave_num_section_p) = @_; + + return "" unless $number =~ /^\d+$/; + $leave_num_section_p = 0 unless defined($leave_num_section_p); + $anchor = "1000000000000000000000000"; + while (($number < $anchor) && ($anchor >= 1000000)) { + $anchor =~ s/000//; + } +# print "number_composed_surface_form number: $number anchor:$anchor\n"; + return "" unless $anchor >= 1000000; + return "" unless $english = $caller->value_to_english($anchor); + $ending = $anchor; + $ending =~ s/^1000//; + return "" unless ($number =~ /$ending$/) || (($number * 1000) % $anchor) == 0; + $num_section = $number / $anchor; + if (($num_section =~ /^[1-9]0?$/) && ! $leave_num_section_p) { + $num_section_english = $caller->value_to_english($num_section); + $num_section = $num_section_english if $num_section_english; + } + $num_section = $caller->commify($num_section); # only for extremely large numbers + return "$num_section $english"; +} + +sub de_scientify { + local($caller,$number) = @_; + +# print "de_scientify: $number\n"; + if ($number =~ /[eE][-+]/) { + ($n,$exp) = ($number =~ /^(\d+)[eE]\+(\d+)$/); + if (defined($exp)) { + $result = $n; + foreach $i (0 .. $exp-1) { + $result .= "0" + } + return $result; + } else { + ($n,$f,$exp) = ($number =~ /^(\d+)\.(\d+)[eE]\+(\d+)$/); + if (defined($exp) && ($exp >= length($f))) { + $result = "$n$f"; + foreach $i (0 .. $exp-1-length($f)) { + $result .= "0"; + } + return $result; + } + } + } + return $number; +} + +sub commify { + local($caller,$number) = @_; + + my $text = reverse $number; + $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $text; +} + +my %plural_rough_number_ht = ( + 10 => "tens", + 12 => "dozens", + 20 => "scores", + 100 => "hundreds", + 1000 => "thousands", + 10000 => "tens of thousands", + 100000 => "hundreds of thousands", + 1000000 => "millions", + 10000000 => "tens of millions", + 100000000 => "hundreds of millions", + 1000000000 => "billions", + 10000000000 => "tens of billions", + 100000000000 => "hundreds of billions", + 1000000000000 => "trillions", + 10000000000000 => "tens of trillions", + 100000000000000 => "hundreds of trillions", +); + +sub plural_rough_plural_number { + local($caller,$number) = @_; + + return $plural_rough_number_ht{$number} || ""; +} + +my %roman_numeral_ht = ( + "I" => 1, + "II" => 2, + "III" => 3, + "IIII" => 4, + "IV" => 4, + "V" => 5, + "VI" => 6, + "VII" => 7, + "VIII" => 8, + "VIIII" => 9, + "IX" => 9, + "X" => 10, + "XX" => 20, + "XXX" => 30, + "XXXX" => 40, + "XL" => 40, + "L" => 50, + "LX" => 60, + "LXX" => 70, + "LXXX" => 80, + "LXXXX" => 90, + "XC" => 90, + "C" => 100, + "CC" => 200, + "CCC" => 300, + "CCCC" => 400, + "CD" => 400, + "D" => 500, + "DC" => 600, + "DCC" => 700, + "DCCC" => 800, + "DCCCC" => 900, + "CM" => 900, + "M" => 1000, + "MM" => 2000, + "MMM" => 3000, + "MMM" => 3000, +); + +sub roman_numeral_value { + local($caller,$s) = @_; + + if (($m, $c, $x, $i) = ((uc $s) =~ /^(M{0,3})(C{1,4}|CD|DC{0,4}|CM|)(X{1,4}|XL|LX{0,4}|XC|)(I{1,4}|IV|VI{0,4}|IX|)$/)) { + $sum = ($roman_numeral_ht{$m} || 0) + + ($roman_numeral_ht{$c} || 0) + + ($roman_numeral_ht{$x} || 0) + + ($roman_numeral_ht{$i} || 0); + return $sum; + } else { + return 0; + } +} + +sub number_surface_forms { + local($caller,$number,$pe) = @_; + + print STDERR "Warning from number_surface_forms: $number not a number\n" + if $logfile && !($number =~ /^(\d+(\.\d+)?|\.\d+)$/); + # $util->log("number_surface_forms number:$number", $logfile); + # $util->log(" surf:$surf", $logfile) if $surf = ($pe && $pe->surf); + + $pe = "" unless defined($pe); + + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = $util->member("yes", @num_style_list) + && (! (($number =~ /^([1-9]|10)$/) && + $util->member("except-small-numbers", @num_style_list))); + $num_style = ($pe) ? $pe->get("num_style") : ""; + if ($follow_num_style) { + if ($num_style =~ /digits_plus_alpha/) { + if ($number =~ /^[1-9]\d?\d?000$/) { + $digital_portion = $number; + $digital_portion =~ s/000$//; + return ("$digital_portion thousand"); + } elsif ($number =~ /^[1-9]\d?\d?000000$/) { + $digital_portion = $number; + $digital_portion =~ s/000000$//; + return ("$digital_portion million"); + } elsif ($number =~ /^[1-9]\d?\d?000000000$/) { + $digital_portion = $number; + $digital_portion =~ s/000000000$//; + return ("$digital_portion billion"); + } + } elsif ($num_style eq "digits") { + if ($number =~ /^\d{1,4}$/) { + return ($number); + } + } + } + + $number = $caller->de_scientify($number); + + $composed_form = $caller->number_composed_surface_form($number); + $composed_form2 = $caller->number_composed_surface_form($number,1); + $lex_form = $caller->value_to_english($number); + $commified_form = $caller->commify($number); + + if ($lex_form) { + if ($number >= 1000000) { + @result = ("one $lex_form", "1 $lex_form", "a $lex_form", $lex_form, $commified_form); + push(@result, $commified_form) if ($number <= $CardinalMaxNonLex); + } elsif ($number >= 100) { + @result = ($commified_form, "one $lex_form", "a $lex_form", $lex_form); + } elsif ($number >= 10) { + @result = ($number, $lex_form); + } elsif ($number == 1) { + @result = ("a", "an", $lex_form); + } elsif ($number == 0) { + @result = ($number, $lex_form); + } else { + @result = ($lex_form); + } + } elsif ($composed_form) { + if ($composed_form eq $composed_form2) { + @result = ($composed_form); + } elsif (($number >= 10000000) && ($composed_form2 =~ /^[1-9]0/)) { + @result = ($composed_form2, $composed_form); + } else { + @result = ($composed_form, $composed_form2); + } + push(@result, $commified_form) if $number <= $CardinalMaxNonLex; + } else { + ($ten,$one) = ($number =~ /^([2-9])([1-9])$/); + ($hundred) = ($number =~ /^([1-9])00$/) unless defined($one); + ($thousand) = ($number =~ /^([1-9]\d?)000$/) unless defined($one) || defined($hundred); + if (defined($one) && defined($ten) + && ($part1 = $caller->value_to_english($ten * 10)) + && ($part2 = $caller->value_to_english($one))) { + $wordy_form = "$part1-$part2"; + @result = ($commified_form, $wordy_form); + } elsif (defined($hundred) + && ($part1 = $caller->value_to_english($hundred))) { + $wordy_form = "$part1 hundred"; + @result = ($commified_form, $wordy_form); + } elsif (defined($thousand) + && ($part1 = $caller->value_to_english($thousand))) { + $wordy_form = "$part1 thousand"; + @result = ($commified_form, $wordy_form); + } elsif ($number =~ /^100000$/) { + @result = ($commified_form, "one hundred thousand", "a hundred thousand", "hundred thousand"); + } elsif ($pe && ($pe->surf eq $number) && ($number =~ /^\d\d\d\d(\.\d+)?$/)) { + @result = ($number); + push(@result, $commified_form) unless $commified_form eq $number; + } elsif ($number =~ /^\d{4,5}$/) { + if ($commified_form eq $number) { + @result = ($number); + } else { + @result = ($commified_form, $number); + } + } else { + @result = ($commified_form); + } + } + push (@result, $number) + unless $util->member($number, @result) || ($number > $CardinalMaxWithoutComma); +# $util->log("number_surface_forms result:@result", $logfile); + + # filter according to num_style + if ($follow_num_style) { + my @filtered_result = (); + foreach $r (@result) { + push(@filtered_result, $r) + if (($num_style eq "digits") && ($r =~ /^\d+$/)) + || (($num_style eq "alpha") && ($r =~ /^[-\@ a-z]*$/i)) + || (($num_style eq "digits_plus_alpha") && ($r =~ /\d.*[a-z]/i)); + } + @result = @filtered_result if @filtered_result; + } + + if ($pe && $pe->childGloss("and")) { + @new_result = (); + foreach $r (@result) { + if ($r =~ /^and /) { + push(@new_result, $r); + } else { + push(@new_result, "and $r"); + } + } + @result = @new_result; + } + return @result; +} + +sub number_range_surface_forms { + local($caller,$pe) = @_; + + $value = $pe->value; + $value_coord = $pe->get("value-coord"); + unless ($value_coord) { + return $caller->number_surface_forms($value); + } + $prefix = ""; + if ($conj = $pe->get("conj")) { + $connector = $conj; + } else { + $connector = ($value_coord == $value + 1) ? "or" : "to"; + } + if ($pe->get("between")) { + $prefix = "between "; + $connector = "and"; + } + + $pe1 = $pe->child("head"); + $pe2 = $pe->child("coord"); + @result1 = $caller->number_surface_forms($value, $pe1); + @result2 = $caller->number_surface_forms($value_coord, $pe2); + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = 1 if $util->member("yes", @num_style_list); + + # between two thousand and three thousand => between two and three thousand + # 3 million to 5 million => 3 to 5 million + if ($follow_num_style && ($#result1 == 0) && ($#result2 == 0)) { + $range = $prefix . $result1[0] . " $connector " . $result2[0]; + $util->log(" range1: $range", $logfile); + $gazillion = "thousand|million|billion|trillion"; + ($a,$gaz1,$b,$gaz2) = ($range =~ /^(.+) ($gazillion) ($connector .+) ($gazillion)$/); + if (defined($a) && defined($gaz1) && defined($b) && defined($gaz2) && ($gaz1 eq $gaz2)) { + $range = "$a $b $gaz1"; + $util->log(" range2: $range", $logfile); + return ($range); + } + } + + @result = (); + foreach $result1 (@result1) { + next if ($value >= 1000) && ($result1 =~ /^\d+$/); + foreach $result2 (@result2) { + next if $result1 =~ /^an?\b/; + push(@result, "$prefix$result1 $connector $result2") + if ($result1 =~ /^[a-z]+$/) && ($result2 =~ /^[a-z]+$/); + next if ($result1 =~ /^[a-z]/) || ($result2 =~ /^[a-z]/); + next if ($value_coord >= 1000) && ($result2 =~ /^\d+$/); + ($digits1,$letters1) = ($result1 =~ /^(\d+(?:.\d+)?) ([a-z].*)$/); + ($digits2,$letters2) = ($result2 =~ /^(\d+(?:.\d+)?) ([a-z].*)$/); + if (defined($digits1) && defined($letters1) + && defined($digits2) && defined($letters2) + && ($letters1 eq $letters2)) { + push(@result, "$prefix$digits1 $connector $digits2 $letters1"); + } elsif (($result1 =~ /^\d{1,3}$/) && ($result2 =~ /^\d{1,3}$/) && !$prefix) { + push(@result, "$result1-$result2"); + if ($connector eq "to") { + my $span = "$result1 to $result2"; + push(@result, $span) unless $util->member($span, @result); + } + } else { + push(@result, "$prefix$result1 $connector $result2"); + } + } + } + unless (@result) { + $result1 = (@result1) ? $result1[0] : $value; + $result2 = (@result2) ? $result2[0] : $value_coord; + @result = "$prefix$result1 $connector $result2"; + } + return @result; +} + +sub q_number_surface_forms { + local($caller,$pe) = @_; + + $surf = $pe->surf; + return ($pe->gloss) unless $value = $pe->value; + if (($value >= 1961) && ($value <= 2030) + && + (($pe->get("struct") eq "sequence of digits") + || + ($surf =~ /^\d+$/))) { + $value = "$prefix $value" if $prefix = $pe->get("prefix"); + @result = ("$value"); + } else { + @result = $caller->number_surface_forms($value,$pe); + @result = $caller->qualify_entities($pe,@result); + } + return @result; +} + +sub ordinal_surface_forms { + local($caller,$number,$exclude_cardinals_p,$exclude_adverbials_p, $pe) = @_; + + if (defined($os = $english_entity_style_ht{"Ordinal"})) { + @ordinal_styles = @{$os}; + } else { + return (); + } + $exclude_cardinals_p = 0 unless defined($exclude_cardinals_p); + @num_style_list = @{$english_entity_style_ht{"FollowSourceLanguageNumberStyle"}}; + $follow_num_style = 1 if $util->member("yes", @num_style_list); + $num_style = ($pe) ? $pe->get("num_style") : ""; + $alpha_ok = ! ($follow_num_style && ($num_style =~ /^digits$/)); + my $c_number = $caller->commify($number); + my $lex_form = ""; + $lex_form = $caller->value_to_english_ordinal($number) if $alpha_ok; + my $adverbial_form + = (($number =~ /^\d+$/) && ($number >= 1) && ($number <= 10) + && $lex_form && $util->member("secondly", @ordinal_styles)) + ? $lex_form . "ly" : ""; + my $num_form = $caller->numeric_ordinal_form($number); + my $c_num_form = $caller->numeric_ordinal_form($c_number); + my @result = (); + +# print "lex_form: $lex_form num_form:$num_form c_num_form:$c_num_form\n"; + if ($lex_form && $util->member("second", @ordinal_styles)) { + if (! $util->member("2nd", @ordinal_styles)) { + @result = ($lex_form); + } elsif ($c_num_form ne $num_form) { + @result = ($c_num_form, $lex_form, $num_form); + } elsif ($number >= 10) { + @result = ($num_form, $lex_form); + } else { + @result = ($lex_form, $num_form); + } + } elsif ($util->member("2nd", @ordinal_styles)) { + if ($c_num_form ne $num_form) { + @result = ($c_num_form, $num_form); + } else { + @result = ($num_form); + } + } + unless ($number =~ /^\d+$/) { + print STDERR "Warning: $number not an integer (for ordinal)\n"; + } + unless ($exclude_cardinals_p) { + $incl_num_card = $util->member("2", @ordinal_styles); + $incl_lex_card = $util->member("two", @ordinal_styles); + foreach $card ($caller->number_surface_forms($number)) { + if ($card =~ /^an?$/) { + # don't include + } elsif ($card =~ /^[0-9,]+$/) { + push(@result, $card) if $incl_num_card; + } else { + push(@result, $card) if $incl_lex_card && $alpha_ok; + } + } + } + push(@result,$adverbial_form) if $adverbial_form && ! $exclude_adverbials_p; + push(@result, $num_form) unless @result; + return @result; +} + +sub ordinal_surface_form { + local($caller,$number,$exclude_cardinals_p,$exclude_adverbials_p, $pe) = @_; + + my @surf_forms = $caller->ordinal_surface_forms($number,$exclude_cardinals_p,$exclude_adverbials_p, $pe); + return (@surf_forms) ? $surf_forms[0] : $caller->numeric_ordinal_form($number); +} + +sub fraction_surface_forms { + local($caller,$pe,$modp) = @_; + + my @result = (); + $numerator = $pe->get("numerator"); + $denominator = $pe->get("denominator"); +# print "numerator: $numerator denominator:$denominator\n"; + @surf_nums = $caller->number_surface_forms($numerator,$pe); + @surf_nums = ("one") if $numerator == 1; + @surf_dens = $caller->ordinal_surface_forms($denominator,1,1); + @surf_dens = ("half") if $denominator == 2; + @surf_dens = ("quarter") if $denominator == 4; + @surf_dens = ("tenth") if $denominator == 10; +# print "surf_nums: @surf_nums surf_dens: @surf_dens\n"; + @fraction_patterns = @{$english_entity_style_ht{"Fraction"}}; + if (@surf_nums && @surf_dens) { + $surf_num = $surf_nums[0]; + $surf_den = $surf_dens[0]; + $surf_num_den = ""; + foreach $sd (@surf_dens) { + $surf_num_den = $sd if $sd =~ /^\d/; + } + $surf_den_w_proper_number = $caller->noun_number_form($surf_den, $numerator); + foreach $fp (@fraction_patterns) { + if ($fp eq "one tenth") { + push(@result, $surf_num . " " . $surf_den_w_proper_number) unless $modp; + } elsif ($fp eq "one-tenth") { + if ($modp) { + push(@result, $surf_num . "-" . $surf_den); + } else { + push(@result, $surf_num . "-" . $surf_den_w_proper_number); + } + } elsif ($fp eq "1/10") { + push(@result, $numerator . "/" . $denominator); + } elsif ($fp eq "1/10th") { + push(@result, $numerator . "/" . $surf_num_den) if $surf_num_den; + } + } + return @result; + } else { + return ($pe->gloss); + } +} + +sub currency_surface_forms { + local($caller,$pe) = @_; + + @currency_surf_forms = (); + return @currency_surf_forms unless $pe->sem =~ /monetary quantity/; + $unit = $pe->get("unit"); + return ($pe->gloss) unless $quant = $pe->get("quant"); + return ($pe->gloss) if $pe->childSem("head") eq "currency symbol"; + $quant_pe = $pe->child("quant"); + if ($unit =~ /^(US|Hongkong) dollar$/) { + @units = $caller->entity_style_listing($unit); + } elsif ($unit eq "yuan") { + @units = $caller->entity_style_listing("Chinese yuan"); + @rmb_pos = @{$english_entity_style_ht{"Chinese RMB position"}}; + @rmb_pos = ("before-number", "after-number") if $util->member("all",@units); + } else { + @units = ($unit); + } + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + # print STDERR "units: @units \n"; + foreach $q (@quants) { + foreach $u_sing (@units) { + $u = ($modp) ? $u_sing : $caller->noun_number_form($u_sing, $quant); +# print " q: $q unit: $u value: $quant\n"; + if ($u eq "RMB") { + if ($util->member("before-number", @rmb_pos)) { + if ($q =~ /^\d/) { + push(@currency_surf_forms, "RMB" . $q); + } + } + if ($util->member("after-number", @rmb_pos)) { + push(@currency_surf_forms, $q . " RMB"); + } + } elsif ($u =~ /\$$/) { + if ($q =~ /^\d/) { + $currency_surf_form = $u . $q; + push(@currency_surf_forms, $currency_surf_form); + } + } else { + $new_form = "$q $u"; + push(@currency_surf_forms, $new_form) if $caller->indef_art_filter($new_form); + } + } + } + @currency_surf_forms = $caller->qualify_entities($pe,@currency_surf_forms); + + # print STDERR "currency_surface_forms: @currency_surf_forms \n"; + return @currency_surf_forms; +} + +sub age_surface_forms { + local($caller,$pe, $modp) = @_; + + $gloss = $pe->gloss; + @age_surf_forms = (); + return @age_surf_forms unless $pe->sem =~ /age quantity/; + $unit = $pe->get("unit"); + return ($gloss) unless $quant = $pe->get("quant"); + $temporal_quant_pe = $pe->child("head"); + $synt = $pe->synt; + if ($synt =~ /parenthetical/) { + if ($pe->get("slashed")) { + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormatSlashed"); + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormat") unless @age_markers; + } else { + @age_markers = $caller->entity_style_listing("ParentheticalAgeFormat"); + } + return ($gloss) unless @age_markers; + foreach $a (@age_markers) { + $age_surf_form = $a; + $age_surf_form =~ s/8/$quant/; + push(@age_surf_forms, $age_surf_form); + } + } elsif (($quant =~ /^\d+$/) && ($temporal_quant_pe->sem eq "age unit")) { + @quants = $caller->number_surface_forms($quant); + @quants = ($quant) if $pe->childSurf("quant") =~ /^\d+$/; + foreach $quant2 (@quants) { + if ($modp) { + push(@age_surf_forms, "$quant2-year-old"); + } else { + $plural_marker = ($quant >= 2) ? "s" : ""; + push(@age_surf_forms, "$quant2 year$plural_marker old"); + } + } + } elsif ($temporal_quant_pe && ($temporal_quant_pe->sem eq "temporal quantity")) { + @temporal_quants = $caller->quantity_surface_forms($temporal_quant_pe, $modp); + foreach $temporal_quant (@temporal_quants) { + push(@age_surf_forms, $temporal_quant . (($modp) ? "-" : " ") . "old"); + } + } else { + return ($gloss); + } + + @age_surf_forms = ($gloss) unless @age_surf_forms; + return @age_surf_forms; +} + +sub occurrence_surface_forms { + local($caller,$pe,$modp) = @_; + + @quantity_surf_forms = (); + return ($pe->gloss) unless $quant = $pe->get("quant"); + $quant_coord = $pe->get("quant-coord"); + $quant_pe = $pe->child("quant"); + $unit = "time"; + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + if ($modp) { + return () if $pe->get("qualifier") || $quant_coord; + return ("one-time") if $quant eq "1"; + return ("two-time", "two-fold", "2-fold") if $quant eq "2"; + } else { + if ($quant_coord) { + return $caller->qualify_entities($pe, ("once or twice")) + if $quant eq "1" and $quant_coord eq "2"; + } else { + return $caller->qualify_entities($pe, ("once")) if $quant eq "1"; + return $caller->qualify_entities($pe, ("twice", "two times", "2 times", + "2-fold", "two fold")) if $quant eq "2"; + } + } + foreach $q (@quants) { + $u = ($modp) ? $unit : $caller->noun_number_form($unit, $quant); + $new_form = "$q $u"; + if ($modp) { + # for the time being, no "more than/over/..." in modifiers: more than 20-ton + if ($pe->get("qualifier")) { + $new_form = ""; + } else { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } + } + push(@quantity_surf_forms, $new_form) if $new_form; + push(@quantity_surf_forms, "$q-fold") if $q =~ /\d/ || ($quant <= 9); + } + @quantity_surf_forms = $caller->qualify_entities($pe,@quantity_surf_forms); + + return @quantity_surf_forms; +} + +sub quantity_surface_forms { + local($caller,$pe,$modp) = @_; + + if ($pe->get("complex") eq "true") { + return () if $modp; + $quantity_surf_form = $pe->gloss; + return ($quantity_surf_form); + } + + @quantity_surf_forms = (); + $sem = $pe->get("sem"); + $scale = $pe->get("scale"); + $scale_mod = $pe->get("scale_mod"); + $unit = $pe->get("unit") || $scale; + $mod_gloss = $pe->get("mod"); + return ($pe->gloss) unless $quant = $pe->get("quant"); + $quant_coord = $pe->get("quant-coord"); + $quant_comb = $quant_coord || $quant; + $quant_pe = $pe->child("quant"); + if (defined($u_style = $english_entity_style_ht{"\u$unit"})) { + @units = @{$u_style}; + } else { + @units = ($unit); + } + if (($pe->sem =~ /range$/) && $quant_pe) { + @quants = $caller->number_range_surface_forms($quant_pe); + } else { + @quants = $caller->number_surface_forms($quant, $quant_pe); + } + @quants = ($quant) unless @quants; + foreach $q (@quants) { + foreach $u_sing (@units) { + my $u = $u_sing; + if (($sem =~ /seismic quantity/) && $scale) { + $scale =~ s/(\w+)\s*/\u\L$1/g if $scale =~ /^(Richter|Mercalli)/i; + $u = "on the $scale_mod $scale scale"; + $u =~ s/\s+/ /g; + } elsif (($u_sing =~ /\S/) && ! $modp) { + $u = $caller->noun_number_form($u_sing, $quant_comb); + } +# print " q: $q unit: $u value: $quant modp: $modp\n"; + @mods = (""); + @mods = ("consecutive", "in a row") if $mod_gloss eq "continuous"; + foreach $mod (@mods) { + $pre_quant_mod = ""; + $in_quant_mod = ($mod =~ /(consecutive)/) ? "$mod " : ""; + $post_quant_mod = ($mod =~ /(in a row)/) ? " $mod" : ""; + $new_form = "$pre_quant_mod$q $in_quant_mod$u$post_quant_mod"; + if ($caller->is_abbreviation($u)) { + if (($pe->sem =~ /range/) && ($q =~ /^[-0-9,\. to]+$/) + && $modp && !($new_form =~ / (to|or) /)) { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } elsif ($q =~ /^[-0-9,\.]+$/) { +# $new_form =~ s/ //g; + } else { + $new_form = ""; + } + } elsif ($modp) { + # for the time being, no "more than/over/..." in modifiers: more than 20-ton + if (($pe->get("qualifier")) || $mod) { + $new_form = ""; + } elsif ($u =~ /(square|cubic|metric|short)/) { + # no hyphenation for the time being (based on CTE style) + } elsif (($pe->sem =~ /range/) && !($new_form =~ / (to|or) /)) { + $new_form =~ s/-/-to-/; + $new_form =~ s/ /-/g; + } else { + $new_form =~ s/ /-/g; + } + } + push(@quantity_surf_forms, $new_form) + if $new_form && $caller->quantity_filter($new_form) && $caller->indef_art_filter($new_form); + } + } + } + @quantity_surf_forms = $caller->qualify_entities($pe,@quantity_surf_forms); + + # print STDERR "QSF unit:$unit sem:$sem Result(s): " . join("; ", @quantity_surf_forms) . "\n"; + return @quantity_surf_forms; +} + +sub qualify_entities { + local($caller,$pe,@surf_forms) = @_; + + $prefix = $pe->get("prefix"); + $prefix_clause = ($prefix) ? "$prefix " : ""; + if ($qualifier = $pe->get("qualifier")) { + $qualifier =~ s/-/ /g; + $qualifier_key = $qualifier; + $qualifier_key =~ s/(\w+)\s*/\u\L$1/g; + # print "qualifier_key: $qualifier_key\n"; + @new_list = (); + if (defined($value = $english_entity_style_ht{$qualifier_key})) { + @quals = @{$value}; + # print STDERR " qk $qualifier_key in ht: @quals :: @surf_forms\n"; + foreach $q (@quals) { + foreach $surf_form (@surf_forms) { + $new_form = "$prefix_clause$q $surf_form"; + push(@new_list, $new_form) if $caller->qualifier_filter($new_form); + } + } + return @new_list if @new_list; + } else { + @keys = sort keys %english_entity_style_ht; + # print STDERR " did not find qk $qualifier_key in ht: @keys\n"; + foreach $surf_form (@surf_forms) { + if (($qualifier =~ /^(couple|few|lot|many|number|several|some)$/i) + && (($art, $lex) = ($surf_form =~ /^(an?)\s+(\S|\S.*\S)\s*$/i))) { + $plural_form = $caller->noun_number_form($lex,2); + $new_form = "$prefix_clause$qualifier $plural_form"; + } else { + $new_form = "$prefix_clause$qualifier $surf_form"; + } + push(@new_list, $new_form) if $caller->qualifier_filter($new_form); + } + return @new_list if @new_list; + } + } + if ($prefix) { + @prefixed_surf_forms = (); + foreach $surf_form (@surf_forms) { + if ($surf_form =~ /^$prefix /) { # already prefixed + push(@prefixed_surf_forms, $surf_form); + } else { + push(@prefixed_surf_forms, "$prefix $surf_form"); + } + } + return @prefixed_surf_forms; + } else { + return @surf_forms; + } +} + +sub percent_surface_forms { + local($caller,$pe,$modp) = @_; + + @percent_surf_forms = (); + return @percent_surf_forms unless $pe->sem eq "percentage"; + $prefix = ""; + $quant = $pe->gloss; + $quant =~ s/%$//; + $quant =~ s/^and //; + if ($pe->gloss =~ /^and /) { + $prefix = "and"; + } + @percent_markers = $caller->entity_style_listing("Percentage"); + @quants = $caller->number_surface_forms($quant); + @quants = ($quant) unless @quants; + foreach $p (@percent_markers) { + foreach $q (@quants) { + if ($p =~ /%$/) { + if ($q =~ /\d$/) { + $percent_surf_form = $q . "%"; + $percent_surf_form = "$prefix $percent_surf_form" if $prefix; + push(@percent_surf_forms, $percent_surf_form); + push(@percent_surf_forms, "by $percent_surf_form") unless $modp || $percent_surf_form =~ /^and /; + } + } else { + if ((($p =~ /^\d/) && ($q =~ /^\d/)) + || + (($p =~ /^[a-z]/) && ($q =~ /^[a-z]/))) { + if ($p =~ /percentage point/) { + if ($quant == 1) { + $percent_surf_form = $q . " percentage point"; + } else { + $percent_surf_form = $q . " percentage points"; + } + } else { + $percent_surf_form = $q . " percent"; + } + $percent_surf_form = "$prefix $percent_surf_form" if $prefix; + $percent_surf_form =~ s/ /-/g if $modp; + push(@percent_surf_forms, $percent_surf_form); + push(@percent_surf_forms, "by $percent_surf_form") unless $modp || $percent_surf_form =~ /^and /; + } + } + } + } + return @percent_surf_forms; +} + +sub decade_century_surface_forms { + local($caller,$pe) = @_; + + if ($pe->sem =~ /century/) { + $gloss = $pe->gloss; + return ("the $gloss", "in the $gloss", $gloss); + } + @decade_surf_forms = (); + return @decade_surf_forms unless $pe->sem =~ /year range\b.*\bdecade/; + @decade_markers = @{$english_entity_style_ht{"Decade"}}; + @extend_decades = @{$english_entity_style_ht{"ExtendDecades"}}; + @extended_decades = @{$english_entity_style_ht{"ExtendedDecade"}}; + $extended_decade = (@extended_decades) ? $extended_decades[0] : "none"; + + $value = $pe->value; + $extended_value = ""; + foreach $extend_decade (@extend_decades) { + if ($extend_decade =~ /$value$/) { + $extended_value = $extend_decade unless $extended_value eq $extend_decade; + last; + } + } + if ($sub = $pe->get("sub")) { + $sub_clause = "$sub "; + $sub_clause =~ s/(mid) /$1-/; + } else { + $sub_clause = ""; + } + + if (! $extended_value) { + @values = ($value); + } elsif ($extended_decade eq "ignore") { + @values = ($value); + } elsif ($extended_decade eq "only") { + @values = ($extended_value); + } elsif ($extended_decade eq "primary") { + @values = ($extended_value, $value); + } elsif ($extended_decade eq "secondary") { + @values = ($value, $extended_value); + } else { + @values = ($value); + } + foreach $v (@values) { + foreach $dm (@decade_markers) { + $dm_ending = $dm; + $dm_ending =~ s/^\d+//; + push (@decade_surf_forms, "the $sub_clause$v$dm_ending"); + push (@decade_surf_forms, "in the $sub_clause$v$dm_ending"); + push (@decade_surf_forms, "$sub_clause$v$dm_ending"); + } + } + return @decade_surf_forms; +} + +sub day_of_the_month_surface_forms { + local($caller,$pe) = @_; + + @dom_surf_forms = (); + return @dom_surf_forms + unless ($pe->sem eq "day of the month") + && ($day_number = $pe->get("day-number")); + @dom_markers = @{$english_entity_style_ht{"DayOfTheMonth"}}; + foreach $dm (@dom_markers) { + $ord = $caller->numeric_ordinal_form($day_number); + if ($dm eq "on the 5th") { + push (@dom_surf_forms, "on the $ord"); + } elsif ($dm eq "the 5th") { + push (@dom_surf_forms, "the $ord"); + } elsif ($dm eq "5th") { + push (@dom_surf_forms, $ord); + } + } + return @dom_surf_forms; +} + +sub score_surface_forms { + local($caller,$pe) = @_; + + @score_surf_forms = (); + if (($score1 = $pe->get("score1")) + && ($score2 = $pe->get("score2"))) { + @score_markers = @{$english_entity_style_ht{"ScoreMarker"}}; + @score_markers = (":") unless @score_markers; + foreach $sm (@score_markers) { + push (@score_surf_forms, "$score1$sm$score2"); + } + } + push(@score_surf_forms, $pe->gloss) unless @score_surf_forms; + return @score_surf_forms; +} + +sub day_of_the_week_surface_forms { + local($caller,$pe) = @_; + + @dom_surf_forms = (); + @dom_markers = @{$english_entity_style_ht{"DayOfTheWeek"}}; + $gloss = $pe->get("gloss"); + $weekday = $pe->get("weekday"); + $weekday = $gloss if ($weekday eq "") && ($gloss =~ /^\S+$/); + $relday = $pe->get("relday"); + $period = $pe->get("period"); + foreach $dm (@dom_markers) { + if (($dm =~ /NOPERIOD/) && $period) { + $surf = ""; # bad combination + } elsif (($dm eq "Sunday") || ! $relday) { + $surf = $weekday; + $surf .= " $period" if $period; + } elsif ($dm =~ /morning/) { + if ($period) { + $surf = $dm; + $surf =~ s/tomorrow/$relday/; + $surf =~ s/morning/$period/; + $surf =~ s/Sunday/$weekday/; + } else { + $surf = ""; # bad combination + } + } else { + $surf = $dm; + if ($period) { + if ($relday eq "today") { + $core_surf = "this $period"; + } else { + $core_surf = "$relday $period"; + } + } else { + $core_surf = $relday; + } + $surf =~ s/tomorrow/$core_surf/; + $surf =~ s/Sunday/$weekday/; + } + $surf =~ s/yesterday night/last night/; + $surf =~ s/this noon, ($weekday)(,\s*)?/today, $1, at noon/; + $surf =~ s/this noon/today at noon/; + $surf =~ s/this night/tonight/; + $surf =~ s/\s*NOPERIOD\s*$//; + push (@dom_surf_forms, $surf) unless $util->member($surf, @dom_surf_forms) || ! $surf; + $on_weekday = "on $surf"; + push (@dom_surf_forms, $on_weekday) + if ($surf eq $weekday) && ! $util->member($on_weekday, @dom_surf_forms); + } + return @dom_surf_forms; +} + +sub date_surface_forms { + local($caller,$pe,$modp) = @_; + + @date_surf_forms = (); + $sem = $pe->sem; + $synt = $pe->synt; + return @date_surf_forms unless $sem =~ /date(\+year)?/; + $day = $pe->get("day"); + $weekday = $pe->get("weekday"); + $month_name = $pe->get("month-name"); + $month_number = $pe->get("month-number"); + $year = $pe->get("year"); + $era = $pe->get("era"); + $era_clause = ""; + $calendar_type = $pe->get("calendar"); + $calendar_type_clause = ""; + $calendar_type_clause = " AH" if $calendar_type eq "Islamic"; + $ad_year = $year; + if ($era eq "Republic era") { + $ad_year = $year + 1911; + $era_clause = " (year $year of the $era)"; + } + $rel = $pe->get("rel"); + if ($sep = $pe->get("sep")) { + $date_surf_form = "$month_number$sep$day"; + $date_surf_form .= "$sep$year" if $year; + $date_surf_form = "$weekday, $date_surf_form" if $weekday; + $date_surf_form = "on $date_surf_form" if $synt eq "pp"; + return ($date_surf_form); + } + @date_months = @{$english_entity_style_ht{"DateMonth"}}; + @date_days = @{$english_entity_style_ht{"DateDay"}}; + @date_order = @{$english_entity_style_ht{"DateOrder"}}; + foreach $m (@date_months) { + if ($m eq "September") { + $surf_month = $month_name; + } elsif ($m =~ /^Sep(\.)?$/) { + if ($month_name eq "May") { + $surf_month = $month_name; + } else { + $period_clause = ($m =~ /\.$/) ? "." : ""; + $surf_month = substr($month_name, 0, 3) . $period_clause; + } + } elsif ($m =~ /^Sept(\.)?$/) { + if ($util->member($month_name, "February", "September")) { + $period_clause = ($m =~ /\.$/) ? "." : ""; + $surf_month = substr($month_name, 0, 4) . $period_clause; + } else { + $surf_month = ""; + } + } else { + $surf_month = ""; + } + foreach $d (@date_days) { + if ($d =~ /^\d+$/) { + $surf_day = $day; + } elsif ($d =~ /^\d+[sthrd]+$/) { + $surf_day = $caller->numeric_ordinal_form($day); + } else { + $surf_day = ""; + } + if ($surf_month && $surf_day) { + foreach $o (@date_order) { + if ($calendar_type eq "Islamic") { + $date_surf_form = "$surf_day $surf_month"; + } elsif ($o eq "September 6, 1998") { + $date_surf_form = "$surf_month $surf_day"; + } elsif ($o eq "6 September, 1998") { + $date_surf_form = "$surf_day $surf_month"; + } + $date_surf_form = "$weekday, $date_surf_form" if $weekday; + $consider_on_p = 1; + if ($year) { + $date_surf_form .= "," unless $calendar_type eq "Islamic"; + $date_surf_form .= " $ad_year$calendar_type_clause$era_clause"; + } elsif ($rel) { + if ($rel eq "current") { + $date_surf_form = "this $date_surf_form"; + } else { + $date_surf_form = "$rel $date_surf_form"; + } + $consider_on_p = 0; + } + push(@date_surf_forms, $date_surf_form) + unless $util->member($date_surf_form, @date_surf_forms) || ($synt eq "pp"); + if ($consider_on_p) { + $on_date_surf_form = "on $date_surf_form"; + push(@date_surf_forms, $on_date_surf_form) + unless $modp || $util->member($on_date_surf_form, @date_surf_forms); + } + + if (($synt eq "pp") && ($sem eq "date")) { + push(@date_surf_forms, $date_surf_form) + unless $util->member($date_surf_form, @date_surf_forms); + } + } + } + } + } + return @date_surf_forms; + # rel, last, next, this +} + +sub numeric_ordinal_form { + local($caller,$cardinal) = @_; + + return $cardinal . "th" if $cardinal =~ /1\d$/; + return $cardinal . "st" if $cardinal =~ /1$/; + return $cardinal . "nd" if $cardinal =~ /2$/; + return $cardinal . "rd" if $cardinal =~ /3$/; + return $cardinal . "h" if $cardinal =~ /t$/; + $cardinal =~ s/y$/ie/; + return $cardinal . "th"; +} + +sub guard_urls_x045 { + local($caller, $s) = @_; + + # URLs (http/https/ftp/mailto) + my $result = ""; + while (($pre, $url, $post) = ($s =~ /^(.*?)((?:(?:https?|ftp):\/\/|mailto:)[#%-;=?-Z_-z~]*[-a-zA-Z0-9\/#])(.*)$/)) { + $result .= "$pre\x04$url\x05"; + $s = $post; + } + $result .= $s; + + # emails + $s = $result; + $result = ""; + while (($pre, $email, $post) = ($s =~ /^(.*?[ ,;:()\/\[\]{}<>|"'])([a-z][-_.a-z0-9]*[a-z0-9]\@[a-z][-_.a-z0-9]*[a-z0-9]\.(?:[a-z]{2,}))([ .,;:?!()\/\[\]{}<>|"'].*)$/i)) { + $result .= "$pre\x04$email\x05"; + $s = $post; + } + $result .= $s; + + # (Twitter style) #hashtag or @handle + $s = $result; + $result = ""; + while (($pre, $hashtag, $post) = ($s =~ /^(.*?[ .,;()\[\]{}'])([#@](?:[a-z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|HHERE)(?:[_a-z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*(?:[a-z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]))(.*)$/i)) { + $result .= "$pre\x04$hashtag\x05"; + $s = $post; + } + $result .= $s; + + # Keep together number+letter in: Fig. 4g; Chromosome 12p + $result =~ s/((?:\b(?:fig))(?:_DONTBREAK_)?\.?|\b(?:figures?|tables?|chromosomes?)|<xref\b[^<>]*\b(?:fig)\b[^<>]*>)\s*(\d+[a-z])\b/$1 \x04$2\x05/gi; + + # special combinations, e.g. =/= emoticons such as :) + $s = $result; + $result = ""; + while (($pre, $special, $post) = ($s =~ /^(.*?)(:-?\)|:-?\(|=\/=?|\?+\/\?+|=\[)(.*)$/)) { + $result .= "$pre\x04$special\x05"; + $s = $post; + } + $result .= $s; + + return $result; +} + +sub guard_xml_tags_x0123 { + local($caller, $s) = @_; + + my $result = ""; + # xml tag might or might not already have "@" on left and/or right end: @<br>@ + while (($pre, $tag, $post) = ($s =~ /^(.*?)(\@?<\/?(?:[a-z][-_:a-z0-9]*)(?:\s+[a-z][-_:a-z0-9]*="[^"]*")*\s*\/?>\@?|&(?:amp|gt|lt|quot);|\[(?:QUOTE|URL)=[^ \t\n\[\]]+\]|\[\/?(?:QUOTE|IMG|INDENT|URL)\]|<\$[-_a-z0-9]+\$>|<\!--.*?-->)(.*)$/si)) { + $result .= $pre; + if (($pre =~ /\S$/) && ($tag =~ /^\S/)) { + $result .= " \x01"; + $result .= "\@" if ($tag =~ /^<[a-z]/i) && (! ($pre =~ /[,;(>]$/)); #) + } else { + $result .= "\x01"; + } + $guarded_tag = $tag; + $guarded_tag =~ s/ /\x02/g; + # print STDERR "tag: $tag\nguarded_tag: $guarded_tag\n" if ($result =~ /Harvey/) || ($s =~ /Harvey/); + $result .= $guarded_tag; + if (($tag =~ /\S$/) && ($post =~ /^\S/)) { # ( + $result .= "\@" if (($tag =~ /^<\//) || ($tag =~ /\/>$/)) && (! ($result =~ /\@$/)) && (! ($post =~ /^[,;)<]/)); + $result .= "\x03 "; + } else { + $result .= "\x03"; + } + $s = $post; + } + $result .= $s; + return $result; +} + +sub restore_urls_x045_guarded_string { + local($caller, $s) = @_; + + my $orig = $s; + while (($pre, $url, $post) = ($s =~ /^(.*?)\x04([^\x04\x05]*?)\x05(.*)$/)) { + $url =~ s/ \@([-:\/])/$1/g; + $url =~ s/([-:\/])\@ /$1/g; + $url =~ s/ //g; + $url =~ s/\x02/ /g; + $s = "$pre$url$post"; + } + if ($s =~ /[\x04\x05]/) { + print STDERR "Removing unexpectedly unremoved x04/x05 marks from $s\n"; + $s =~ s/[\x04\x05]//g; + } + return $s; +} + +sub restore_xml_tags_x0123_guarded_string { + local($caller, $s) = @_; + + my $result = ""; + while (($pre, $tag, $post) = ($s =~ /^(.*?)\x01(.*?)\x03(.*)$/)) { + $result .= $pre; + $tag =~ s/ \@([-:\/])/$1/g; + $tag =~ s/([-:\/])\@ /$1/g; + $tag =~ s/ //g; + $tag =~ s/\x02/ /g; + $result .= $tag; + $s = $post; + } + $result .= $s; + return $result; +} + +sub load_english_abbreviations { + local($caller, $filename, *ht, $verbose) = @_; + # e.g. /nfs/nlg/users/textmap/brahms-ml/arabic/data/EnglishAbbreviations.txt + + $verbose = 1 unless defined($verbose); + my $n = 0; + if (open(IN, $filename)) { + while (<IN>) { + next if /^\# /; + s/\s*$//; + my @expansions; + if (@expansions = split(/\s*::\s*/, $_)) { + my $abbrev = shift @expansions; + $ht{IS_ABBREVIATION}->{$abbrev} = 1; + $ht{IS_LC_ABBREVIATION}->{(lc $abbrev)} = 1; + foreach $expansion (@expansions) { + $ht{ABBREV_EXPANSION}->{$abbrev}->{$expansion} = 1; + $ht{ABBREV_EXPANSION_OF}->{$expansion}->{$abbrev} = 1; + } + $n++; + } + } + close(IN); + print STDERR "Loaded $n entries from $filename\n" if $verbose; + } else { + print STDERR "Can't open $filename\n"; + } +} + +sub load_split_patterns { + local($caller, $filename, *ht) = @_; + # e.g. /nfs/nlg/users/textmap/brahms-ml/arabic/data/BioSplitPatterns.txt + + my $n = 0; + if (open(IN, $filename)) { + while (<IN>) { + next if /^\# /; + s/\s*$//; + if (($s) = ($_ =~ /^SPLIT-DASH-X\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT_DASH_X}->{$s} = 1; + $ht{LC_SPLIT_DASH_X}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^SPLIT-X-DASH\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT_X_DASH}->{$s} = 1; + $ht{LC_SPLIT_X_DASH}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT-DASH-X\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT_DASH_X}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT_DASH_X}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT-X-DASH\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT_X_DASH}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT_X_DASH}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^DO-NOT-SPLIT\s+(\S.*\S|\S)\s*$/)) { + $ht{DO_NOT_SPLIT}->{$s} = 1; + $ht{LC_DO_NOT_SPLIT}->{(lc $s)} = 1; + $n++; + } elsif (($s) = ($_ =~ /^SPLIT\s+(\S.*\S|\S)\s*$/)) { + $ht{SPLIT}->{$s} = 1; + $ht{LC_SPLIT}->{(lc $s)} = 1; + $n++; + } + } + close(IN); + print STDERR "Loaded $n entries from $filename\n"; + } else { + print STDERR "Can't open $filename\n"; + } +} + +sub guard_abbreviations_with_dontbreak { + local($caller, $s, *ht) = @_; + + my $orig = $s; + my $result = ""; + while (($pre,$potential_abbrev,$period,$post) = ($s =~ /^(.*?)((?:[a-z]+\.-?)*(?:[a-z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])+)(\.)(.*)$/i)) { + if (($pre =~ /([-&\/0-9]|[-\/]\@ )$/) + && (! ($pre =~ /\b[DR](?: \@)?-(?:\@ )?$/))) { # D-Ariz. + $result .= "$pre$potential_abbrev$period"; + } else { + $result .= $pre . $potential_abbrev; + $potential_abbrev_with_period = $potential_abbrev . $period; + if ($ht{IS_ABBREVIATION}->{$potential_abbrev_with_period}) { + $result .= "_DONTBREAK_"; + } elsif ($ht{IS_LC_ABBREVIATION}->{(lc $potential_abbrev_with_period)}) { + $result .= "_DONTBREAK_"; + } + $result .= $period; + } + $s = $post; + } + $result .= $s; + $result =~ s/\b([Nn])o\.(\s*\d)/$1o_DONTBREAK_.$2/g; + return $result; +} + +$alpha = "(?:[a-z]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|[\xD0-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])"; +$alphanum = "(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|[\xD0-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])(?:[-_a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])|(?:[a-z0-9]|\xCE[\xB1-\xBF]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])"; + +sub normalize_punctuation { + local($caller, $s) = @_; + + $s =~ s/\xE2\x80[\x93\x94]/-/g; # ndash, mdash to hyphen + $s =~ s/ \@([-\/])/$1/g; + $s =~ s/([-\/])\@ /$1/g; + return $s; +} + +sub update_replace_characters_based_on_context { + local($caller, $s) = @_; + + # This is just a start. Collect stats over text with non-ASCII, e.g. K?ln. + # HHERE + my $rest = $s; + $s = ""; + while (($pre, $left, $repl_char, $right, $post) = ($rest =~ /^(.*?\s+)(\S*)(\xEF\xBF\xBD)(\S*)(\s.*)$/)) { + $s .= "$pre$left"; + if (($left =~ /[a-z]$/i) && ($right =~ /^s(?:[-.,:;?!].*)?$/i)) { # China's etc. + $repl_char = "\xE2\x80\x99"; # right single quotation mark + } elsif (($left =~ /n$/i) && ($right =~ /^t$/i)) { # don't etc. + $repl_char = "\xE2\x80\x99"; # right single quotation mark + } elsif (($left =~ /[a-z]\s*[.]$/i) && ($right eq "")) { # end of sentence + $repl_char = "\xE2\x80\x9D"; # right double quotation mark + } elsif (($left eq "") && ($right =~ /^[A-Z]/i)) { # start of word + $repl_char = "\xE2\x80\x9C"; # left double quotation mark + } + $s .= "$repl_char$right"; + $rest = $post; + } + $s .= $rest; + + return $s; +} + +sub tokenize { + local($caller, $s, *ht, $control) = @_; + + my $local_verbose = 0; + print "Point A: $s\n" if $local_verbose; + $control = "" unless defined($control); + my $bio_p = ($control =~ /\bbio\b/); + + $s = $utf8->repair_misconverted_windows_to_utf8_strings($s); + print "Point A2: $s\n" if $local_verbose; + $s = $utf8->delete_weird_stuff($s); + print "Point B: $s\n" if $local_verbose; + + # reposition xml-tag with odd space + $s =~ s/( +)((?:<\/[a-z][-_a-z0-9]*>)+)(\S)/$2$1$3/ig; + $s =~ s/(\S)((?:<[a-z][^<>]*>)+)( +)/$1$3$2/ig; + print "Point C: $s\n" if $local_verbose; + + $a_value = $ht{IS_ABBREVIATION}->{"Fig."} || "n/a"; + $s = $caller->guard_abbreviations_with_dontbreak($s, *ht); + my $standard_abbrev_s = "Adm|al|Apr|Aug|Calif|Co|Dec|Dr|etc|e.g|Feb|Febr|Gen|Gov|i.e|Jan|Ltd|Lt|Mr|Mrs|Nov|Oct|Pfc|Pres|Prof|Sen|Sept|U.S.A|U.S|vs"; + my $pre; + my $core; + my $post; + $s = " $core " if ($pre,$core,$post) = ($s =~ /^(\s*)(.*?)(\s*)$/i); + $s =~ s/\xE2\x80\x89/ /g; # thin space + $standard_abbrev_s =~ s/\./\\\./g; + $s =~ s/[\x01-\x05]//g; + $s = $caller->guard_urls_x045($s); + $s = $caller->guard_xml_tags_x0123($s); + $s = $caller->update_replace_characters_based_on_context($s); + $s =~ s/((?:[a-zA-Z_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])\.)([,;]) /$1 $2 /g; + $s =~ s/((?:[a-zA-Z_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])\.)(\x04)/$1 $2/g; + if ($bio_p) { + $s =~ s/(\S)((?:wt\/|onc\/)?(?:[-+]|\?+|\xE2\x80[\x93\x94])\/(?:[-+]|\?+|\xE2\x80[\x93\x94]))/$1 $2/g; + $s =~ s/((?:[-+]|\xE2\x80[\x93\x94])\/(?:[-+]|\xE2\x80[\x93\x94]))(\S)/$1 $2/g; + } + print "Point D: $s\n" if $local_verbose; + $s =~ s/(~+)/ $1 /g; + $s =~ s/((?:\xE2\x80\xB9|\xE2\x80\xBA|\xC2\xAB|\xC2\xBB|\xE2\x80\x9E)+)/ $1 /g; # triangular bracket(s) "<" or ">" etc. + $s =~ s/(``)([A-Za-z])/$1 $2/g; # added Nov. 30, 2017 + $s =~ s/((?:<|<)?=+(?:>|>)?)/ $1 /g; # include arrows + $s =~ s/(\\")/ $1 /g; + $s =~ s/([^\\])("+)/$1 $2 /g; + $s =~ s/([^\\])((?:\xE2\x80\x9C)+)/$1 $2 /g; # open " + $s =~ s/([^\\])((?:\xE2\x80\x9D)+)/$1 $2 /g; # close " + $s =~ s/((?:<|<)?-{2,}(?:>|>)?)/ $1 /g; # include arrows + $s =~ s/((?:\xE2\x80\xA6)+)/ $1 /g; # ellipsis + print "Point E: $s\n" if $local_verbose; + foreach $_ ((1..2)) { + # colon + $s =~ s/([.,;])(:+)/$1 \@$2/g; + $s =~ s/(:+)([.,;])/$1 \@\@ $2/g; + # # question mark/exclamation mark blocks + # $s =~ s/([^!?])([!?]+)([^!?])/$1 $2 $3/g; + } + print "Point F: $s\n" if $local_verbose; + $s =~ s/(\?)/ $1 /g; + $s =~ s/(\!)/ $1 /g; + $s =~ s/ +/ /g; + $s =~ s/(\$+|\xC2\xA3|\xE2\x82[\xA0-\xBE])/ $1 /g; # currency signs (Euro sign; British pound sign; Yen sign etc.) + $s =~ s/(\xC2\xA9|\xE2\x84\xA2)/ $1 /g; # copyright/trademark signs + $s =~ s/(\xC2\xB2)([-.,;:!?()])/$1 $2/g; # superscript 2 + $s =~ s/([^ ])( )/$1 $2/g; + $s =~ s/( )([^ ])/$1 $2/g; + $s =~ s/(&#\d+|&#x[0-9A-F]+);/$1_DONTBREAK_;/gi; + $s =~ s/([\@\.]\S*\d)([a-z][A-z])/$1_DONTBREAK_$2/g; # email address, URL + $s =~ s/ ($standard_abbrev_s)\./ $1_DONTBREAK_\./gi; + $s =~ s/ ($standard_abbrev_s) \. (\S)/ $1_DONTBREAK_\. $2/gi; + $s =~ s/\b((?:[A-Za-z]\.){1,3}[A-Za-z])\.\s+/$1_DONTBREAK_\. /g; # e.g. a.m. O.B.E. + $s =~ s/([ ])([A-Z])\. ([A-Z])/$1$2_DONTBREAK_\. $3/; # e.g. George W. Bush + $s =~ s/(\S\.*?[ ])([A-Z])_DONTBREAK_\. (After|All|And|But|Each|Every|He|How|In|It|My|She|So|That|The|Then|There|These|They|This|Those|We|What|When|Which|Who|Why|You)([', ])/$1$2\. $3$4/; # Exceptions to previous line, e.g. "plan B. This" + $s =~ s/\b(degrees C|[Ff]ig\.? \d+ ?[A-Z]|(?:plan|Scud) [A-Z])_DONTBREAK_\./$1\./g; # Exception, e.g. "plan B"; + $s =~ s/([^-_a-z0-9])(art|fig|no|p)((?:_DONTBREAK_)?\.)(\d)/$1$2$3 $4/gi; # Fig.2 No.14 + $s =~ s/([^-_A-Za-z0-9])(\d+(?:\.\d+)?)(?:_DONTBREAK_)?(thousand|million|billion|trillion|min|mol|sec|kg|km|g|m|p)\b/$1$2 $3/g; # 3.4kg 1.7million 49.9p + $s =~ s/([^-_a-z0-9])((?:[1-9]|1[0-2])(?:[.:][0-5]\d)?)(?:_DONTBREAK_)?([ap]m\b|[ap]\.m(?:_DONTBREAK_)?\.)/$1$2 $3/gi; # 3.15pm 12:00p.m. 8am + print "Point H: $s\n" if $local_verbose; + + $s =~ s/(\d)([a-z][A-z])/$1 $2/g; + $s =~ s/(\w|`|'|%|[a-zA-Z]\.|[a-zA-Z]_DONTBREAK_\.)(-|\xE2\x80[\x93\x94])(\w|`|')/$1 \@$2\@ $3/g; + $s =~ s/(\w|`|'|%|[a-zA-Z]\.|[a-zA-Z]_DONTBREAK_\.)(-|\xE2\x80[\x93\x94])(\w|`|')/$1 \@$2\@ $3/g; + $s =~ s/(\w)- /$1 \@- /g; + $s =~ s/ -(\w)/ -\@ $1/g; + $s =~ s/(\d):(\d)/$1 \@:\@ $2/g; + $s =~ s/(\d)\/(\d)/$1 \@\/\@ $2/g; + $s =~ s/($alphanum)\/([,;:!?])/$1 \@\/\@ $2/g; + $s =~ s/($alphanum)([-+]+)\/($alphanum)/$1$2 \@\/\@ $3/gi; + print "Point I: $s\n" if $local_verbose; + foreach $_ ((1..5)) { + $s =~ s/([ \/()])($alphanum) ?\/ ?($alphanum)([-+ \/().,;])/$1$2 \@\/\@ $3$4/gi; + } + $s =~ s/([a-zA-Z%\/\[\]]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05|[a-zA-Z]_DONTBREAK_\.)([,;:!?])\s*(\S)/$1 $2 $3/g; + # asterisk + $s =~ s/( [(\[]?)(\*)([a-z0-9])/$1$2\@ $3/gi; + $s =~ s/([a-z0-9])(\*)([.,;:)\]]* )/$1 \@$2$3/gi; + print "Point J: $s\n" if $local_verbose; + + # Arabic, Cyrillic scripts + if ($s =~ /[\xD0-\xD3\xD8-\xDB]/) { + for (my $i=0; $i <= 1; $i++) { + $s =~ s/([\xD8-\xDB][\x80-\xBF])([,;:!?.\(\)\[\]\/]|\xD8\x8C|\xD8\x9B|\xD8\x9F|\xD9\xAA|\xC2\xAB|\xC2\xBB|\xE2[\x80-\x9F][\x80-\xBF])/$1 $2/gi; # punctuation includes Arabic ,;?% + $s =~ s/([,;:!?.\(\)\[\]\/]|\xD8\x8C|\xD8\x9B|\xD8\x9F|\xD9\xAA|\xC2\xAB|\xC2\xBB|\xE2[\x80-\x9F][\x80-\xBF])([\xD8-\xDB][\x80-\xBF])/$1 $2/gi; + } + } + $s =~ s/(\d|[a-zA-Z]|[\xD0-\xD3\xD8-\xDB][\x80-\xBF])([-]|\xE2\x80[\x93\x94])([\xD0-\xD3\xD8-\xDB][\x80-\xBF])/$1 \@$2\@ $3/g; + $s =~ s/(\d|[a-zA-Z])([\xD8-\xDB][\x80-\xBF])/$1 \@\@ $2/g; + $s =~ s/($alpha)(-|\xE2\x80[\x93\x94]|\xEF\xBF\xBD\.*) /$1 \@$2 /g; # added Nov. 30, 2020 # \xEF\xBF\xBD: repl.char. + $s =~ s/ (-|\xE2\x80[\x93\x94]|\xE2\x99\xA6|\xEF\xBF\xBD)($alpha)/ $1\@ $2/g; # added Nov. 30, 2020 + print "Point K: $s\n" if $local_verbose; + + # misc. non-ASCII punctuation + $s =~ s/(\xC2[\xA1\xBF]|\xD5\x9D|\xD6\x89|\xD8[\x8C\x9B]|\xD8\x9F|\xD9[\xAA\xAC]|\xDB\x94|\xDC[\x80\x82])/ $1 /g; + $s =~ s/(\xE0\xA5[\xA4\xA5]|\xE0\xBC[\x84-\x86\x8D-\x8F\x91\xBC\xBD])/ $1 /g; + $s =~ s/(\xE1\x81[\x8A\x8B]|\xE1\x8D[\xA2-\xA6])/ $1 /g; + $s =~ s/(\xE1\x81[\x8A\x8B]|\xE1\x8D[\xA2-\xA6]|\xE1\x9F[\x94\x96])/ $1 /g; + $s =~ s/([^0-9])(5\xE2\x80\xB2)(-)([ACGTU])/$1 $2 \@$3\@ $4/g; # 5-prime-DNA-seq. + $s =~ s/([^0-9])([35]\xE2\x80\xB2)/$1 $2 /g; # prime (keep 3-prime/5-prime together for bio domain) + $s =~ s/([^0-9])(\xE2\x80\xB2)/$1 $2 /g; # prime + $s =~ s/(\xE2\x81\x99)/ $1 /g; # five dot punctuation + $s =~ s/(\xE3\x80[\x81\x82\x8A-\x91]|\xE3\x83\xBB|xEF\xB8\xB0|\xEF\xBC\x8C)/ $1 /g; + $s =~ s/(\xEF\xBC[\x81-\x8F\x9A\x9F])/ $1 /g; # CJK fullwidth punctuation (e.g. fullwidth exclamation mark) + print "Point L: $s\n" if $local_verbose; + # spaces + $s =~ s/((?:\xE3\x80\x80)+)/ $1 /g; # idiographic space + $s =~ s/((?:\xE1\x8D\xA1)+)/ $1 /g; # Ethiopic space + + # isolate \xF0 and up from much more normal characters + $s =~ s/([\xF0-\xFE][\x80-\xBF]*)([\x00-\x7F\xC0-\xDF][\x80-\xBF]*)/$1 $2/g; + $s =~ s/([\x00-\x7F\xC0-\xDF][\x80-\xBF]*)([\xF0-\xFE][\x80-\xBF]*)/$1 $2/g; + print "Point M: $s\n" if $local_verbose; + + $s =~ s/( \d+)([,;:!?] )/$1 $2/g; + $s =~ s/ ([,;()\[\]])([a-zA-Z0-9.,;])/ $1 $2/g; + $s =~ s/(\)+)([-\/])([a-zA-Z0-9])/$1 $2 $3/g; + $s =~ s/([0-9\*\[\]()]|\xE2\x80\xB2)([.,;:] )/$1 $2/g; + $s =~ s/([a-zA-Z%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)([,;:.!?])([")]|''|\xE2\x80[\x99\x9D]|)(\s)/$1 $2 $3$4/g; + $s =~ s/([a-zA-Z%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)([,;:.!?])([")]|''|\xE2\x80[\x99\x9D]|)\s*$/$1 $2 $3/g; + $s =~ s/([.,;:]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)('|\xE2\x80[\x99\x9D])/$1 $2/g; + $s =~ s/('|\xE2\x80[\x99\x9D])([.,;:]|\x04)/$1 $2/g; + $s =~ s/([(){}\[\]]|\xC2\xB1)/ $1 /g; + $s =~ s/([a-zA-Z0-9]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.\s*$/$1 ./g; + $s =~ s/([a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.\s+/$1 . /g; + $s =~ s/([a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF]|\x05)\.(\x04)/$1 . $2/g; + $s =~ s/([0-9]),\s+(\S)/$1 , $2/g; + $s =~ s/([a-zA-Z])(\$)/$1 $2/g; + $s =~ s/(\$|[~<=>]|\xC2\xB1|\xE2\x89[\xA4\xA5]|\xE2\xA9[\xBD\xBE])(\d)/$1 $2/g; + $s =~ s/(RMB)(\d)/$1 $2/g; + print "Point N: $s\n" if $local_verbose; + foreach $_ ((1..2)) { + $s =~ s/([ '"]|\xE2\x80\x9C)(are|could|did|do|does|had|has|have|is|should|was|were|would)(n't|n\xE2\x80\x99t)([ '"]|\xE2\x80\x9D)/$1 $2 $3 $4/gi; + $s =~ s/ (can)(not) / $1 $2 /gi; + $s =~ s/ (ca)\s*(n)('t|\xE2\x80\x99t) / $1$2 $2$3 /gi; + $s =~ s/ ([Ww])o\s*n('|\xE2\x80\x99)t / $1ill n$2t /g; + $s =~ s/ WO\s*N('|\xE2\x80\x99)T / WILL N$1T /g; + $s =~ s/ ([Ss])ha\s*n('|\xE2\x80\x99)t / $1hall n$2t /g; + $s =~ s/ SHAN('|\xE2\x80\x99)T / SHALL N$1T /g; + # $s =~ s/ ain('|\xE2\x80\x99)t / is n$1t /g; + # $s =~ s/ Ain('|\xE2\x80\x99)t / Is n$1t /g; + # $s =~ s/ AIN('|\xE2\x80\x99)T / IS N$1T /g; + } + print "Point O: $s\n" if $local_verbose; + $s =~ s/(\d)%/$1 %/g; + $s =~ s/ '(d|ll|m|re|s|ve|em) / '_DONTBREAK_$1 /g; # 'd = would; 'll = will; 'em = them + $s =~ s/ \xE2\x80\x99t(d|ll|m|re|s|ve) / \xE2\x80\x99t_DONTBREAK_$1 /g; + $s =~ s/([^0-9a-z'.])('|\xE2\x80\x98)([0-9a-z])/$1$2 $3/gi; + $s =~ s/([0-9a-z])(\.(?:'|\xE2\x80\x99))([^0-9a-z']|\xE2\x80\x99)/$1 $2$3/gi; + $s =~ s/([0-9a-z]_?\.?)((?:'|\xE2\x80\x99)(?:d|ll|m|re|s|ve|))([^0-9a-z'])/$1 $2$3/gi; + $s =~ s/([("]|\xE2\x80\x9C|'')(\w)/$1 $2/g; + print "Point P: $s\n" if $local_verbose; + $s =~ s/(\w|[.,;:?!])([")]|''|\xE2\x80\x9D)/$1 $2/g; + $s =~ s/ ([,;()\[\]])([a-zA-Z0-9.,;])/ $1 $2/g; + $s =~ s/([a-z0-9]) ?(\()([-+_ a-z0-9\/]+)(\))/$1 $2 $3 $4 /ig; + $s =~ s/([a-z0-9]) ?(\[)([-+_ a-z0-9\/]+)(\])/$1 $2 $3 $4 /ig; + $s =~ s/([a-z0-9]) ?(\{)([-+_ a-z0-9\/]+)(\})/$1 $2 $3 $4 /ig; + $s =~ s/([%])-(\d+(?:\.\d+)? ?%)/$1 \@-\@ $2/g; + $s =~ s/( )(art|No)_DONTBREAK_(\.{2,})/$1 $2$3/gi; + $s =~ s/(_DONTBREAK_\.)(\.{1,})/$1 $2/g; + print "Point Q: $s\n" if $local_verbose; + foreach $_ ((1 .. 2)) { + $s =~ s/(\s(?:[-a-z0-9()']|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*)(\.{2,})((?:[-a-z0-9()?!:\/']|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])*\s|(?:[-a-z0-9()'\/]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|[\xC4-\xC9\xCE-\xD3][\x80-\xBF]|\xE0[\xA4-\xA5][\x80-\xBF]|\xE0[\xB6-\xB7][\x80-\xBF])+\.\s)/$1 $2 $3/gi; + } + $s =~ s/0s\b/0 s/g; + $s =~ s/([0-9])(\x04)/$1 $2/g; + $s =~ s/ +/ /g; + print "Point R: $s\n" if $local_verbose; + + if ($bio_p) { + foreach $_ ((1 .. 2)) { + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ (\d+(?:$alpha)?\d*\+?)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ ((?:alpha|beta|kappa)\d+)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z]) \@(-|\xE2\x80[\x93\x94])\@ ((?:a|b|h|k)\d)([- \/])/$1$2$3$4/ig; + $s =~ s/([a-z0-9]) \@(-|\xE2\x80[\x93\x94])\@ ([a-z])([- \/])/$1$2$3$4/ig; + $s =~ s/([- \/])(\d*[a-z]) \@(-|\xE2\x80[\x93\x94])\@ ([a-z0-9])/$1$2$3$4/ig; + } + # mutation indicators such -/- etc. + $s =~ s/(\?\/) +(\?)/$1$2/g; + $s =~ s/([^ ?])((?:wt\/|onc\/)?(?:[-+]|\?+|\xE2\x80[\x93\x94])\/(?:[-+]|\?+|\xE2\x80[\x93\x94]))/$1 $2/g; + $s =~ s/((?:[-+]|\xE2\x80[\x93\x94])\/(?:[-+]|\xE2\x80[\x93\x94]))(\S)/$1 $2/g; + + # Erk1/2 + $rest = $s; + $s = ""; + while (($pre, $stem, $slashed_number_s, $post) = ($rest =~ /^(.*?[^-_a-z0-9])([a-z][-_a-z]*)(\d+(?:(?: \@)?\/(?:\@ )?(?:\d+))+)([^-+a-z0-9].*|)$/i)) { + if ((($pre =~ /\x04[^\x05]*$/) && ($post =~ /^[^\x04]*\x05/)) + || ($stem =~ /^(mid|pre|post|sub|to)$/i)) { + $s .= "$pre$stem$slashed_number_s"; + } else { + $s .= $pre; + my @slashed_numbers = split(/(?: \@)?\/(?:\@ )?/, $slashed_number_s); + foreach $i ((0 .. $#slashed_numbers)) { + my $number = $slashed_numbers[$i]; + $s .= "$stem$number"; + $s .= " @\/@ " unless $i == $#slashed_numbers; + } + } + $rest = $post; + } + $s .= $rest; + + # Erk-1/-2 + while (($pre, $stem, $dash1, $number1, $dash2, $number2, $post) = ($s =~ /^(.*[^-_a-z0-9])([a-z][-_a-z]*)(?: \@)?(-|\xE2\x80[\x93\x94])(?:\@ )?(\d+)(?: \@)?\/(?:\@ )?(?:\@ )?(-|\xE2\x80[\x93\x94])(?:\@ )?(\d+)([^-+a-z0-9].*|)$/i)) { + $s = "$pre$stem$dash1$number1 \@\/\@ $stem$dash2$number2$post"; + } + $rest = $s; + $s = ""; + # IFN-a/b (Slac2-a/b/c) + while (($pre, $stem, $dash, $slashed_letter_s, $post) = ($rest =~ /^(.*[^-_a-z0-9])([a-z][-_a-z0-9]*)(-|\xE2\x80[\x93\x94])([a-z](?:(?: \@)?\/(?:\@ )?(?:[a-z]))+)([^-+a-z0-9].*|)$/i)) { + if (($pre =~ /\x04[^\x05]*$/) && ($post =~ /^[^\x04]*\x05/)) { + $s .= "$pre$stem$dash1$number1$dash2$number2"; + } else { + $s .= $pre; + my @slashed_letters = split(/(?: \@)?\/(?:\@ )?/, $slashed_letter_s); + foreach $i ((0 .. $#slashed_letters)) { + my $letter = $slashed_letters[$i]; + $s .= "$stem$dash$letter"; + $s .= " @\/@ " unless $i == $#slashed_letters; + } + } + $rest = $post; + } + $s .= $rest; + + # SPLIT X-induced + my $rest = $s; + my $new_s = ""; + while (($pre, $dash, $right, $post) = ($rest =~ /^(.*?)(-|\xE2\x80[\x93\x94])([a-z]+)( .*|)$/i)) { + $new_s .= $pre; + if (($right eq "I") && ($pre =~ / [a-zA-Z][a-z]*$/)) { + # compatriots-I have a dream + $new_s .= " \@" . $dash . "\@ "; + } elsif ($ht{LC_SPLIT_DASH_X}->{($caller->normalize_punctuation(lc $right))}) { + $new_s .= " \@" . $dash . "\@ "; + } else { + $new_s .= $dash; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # SPLIT ubiquinated-X + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $post) = ($rest =~ /^(.*? |)([a-z0-9]+|'s)(-|\xE2\x80[\x93\x94])([a-z0-9].*)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_SPLIT_X_DASH}->{($caller->normalize_punctuation(lc $left))}) { + $new_s .= " \@" . $dash . "\@ "; + } else { + $new_s .= $dash; + } + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # SPLIT low-frequency + $rest = $s; + $new_s = ""; + if (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*?[- ]|)([a-z]+)([-\/]|\xE2\x80[\x93\x94])([a-z]+)([- ].*|)$/i)) { + } + while (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*?[-\/ ]|)([a-z]+)((?: \@)?(?:[-\/]|\xE2\x80[\x93\x94])(?:\@ )?)([a-z]+)([-\/ ].*|)$/i)) { + $x = $caller->normalize_punctuation(lc ($left . $dash . $right)); + if ($ht{LC_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash . $right)))}) { + $pre =~ s/([-\/])$/ \@$1\@ /; + $post =~ s/^([-\/])/ \@$1\@ /; + $dash = $caller->normalize_punctuation($dash); + $new_s .= "$pre$left"; + $new_s .= " \@" . $dash . "\@ "; + $new_s .= $right; + $rest = $post; + } elsif ($pre =~ /[-\/]$/) { + $new_s .= $pre; + $rest = "$left$dash$right$post"; + } else { + $new_s .= "$pre$left"; + $rest = "$dash$right$post"; + } + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT X-ras + $rest = $s; + $new_s = ""; + while (($pre, $dash, $right, $post) = ($rest =~ /^(.*?) \@(-|\xE2\x80[\x93\x94])\@ ([a-z0-9]+)( .*|)$/i)) { + $new_s .= $pre; + if ($ht{LC_DO_NOT_SPLIT_DASH_X}->{($caller->normalize_punctuation(lc $right))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT Caco-X + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $post) = ($rest =~ /^(.*? |)([a-z0-9]+) \@([-\/]|\xE2\x80[\x93\x94]])\@ ([a-z0-9].*)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT_X_DASH}->{($caller->normalize_punctuation(lc $left))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT down-modulate (2 elements) + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash, $right, $post) = ($rest =~ /^(.*? |)([a-z0-9]+) \@([-\/]|\xE2\x80[\x93\x94]])\@ ([a-z0-9]+)( .*|)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash . $right)))}) { + $new_s .= $dash; + } else { + $new_s .= " \@" . $dash . "\@ "; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + # DO-NOT-SPLIT 14-3-3 (3 elements) + $rest = $s; + $new_s = ""; + while (($pre, $left, $dash_group1, $dash1, $middle, $dash_group2, $dash2, $right, $post) = ($rest =~ /^(.*? |)([a-z0-9]+)((?: \@)?([-\/]|\xE2\x80[\x93\x94]])(?:\@ )?)([a-z0-9]+)((?: \@)?([-\/]|\xE2\x80[\x93\x94]])(?:\@ )?)([a-z0-9]+)( .*|)$/i)) { + $new_s .= "$pre$left"; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash1 . $middle . $dash2 . $right)))}) { + $new_s .= $dash1; + } else { + $new_s .= $dash_group1; + } + $new_s .= $middle; + if ($ht{LC_DO_NOT_SPLIT}->{($caller->normalize_punctuation(lc ($left . $dash1 . $middle . $dash2 . $right)))}) { + $new_s .= $dash2; + } else { + $new_s .= $dash_group2; + } + $new_s .= $right; + $rest = $post; + } + $new_s .= $rest; + $s = $new_s; + + $s =~ s/ +/ /g; + } + print "Point S: $s\n" if $local_verbose; + + $s =~ s/_DONTBREAK_//g; + $s =~ s/( )(ark|ill|mass|miss|wash|GA|LA|MO|OP|PA|VA|VT)(\.)( )/$1$2 $3$4/g; + print "Point T: $s\n" if $local_verbose; + $s = $caller->restore_urls_x045_guarded_string($s); + $s = $caller->restore_xml_tags_x0123_guarded_string($s); + print "Point U: $s\n" if $local_verbose; + $s =~ s/(https?|ftp)\s*(:)\s*(\/\/)/$1$2$3/gi; + $s =~ s/\b(mailto)\s*(:)\s*([a-z])/$1$2$3/gi; + $s =~ s/(\d)\s*(:)\s*([0-5]\d[^0-9])/$1$2$3/gi; + print "Point V: $s\n" if $local_verbose; + $s =~ s/(5\xE2\x80\xB2-[ACGT]+)\s*(-|\xE2\x80[\x93\x94])\s*(3\xE2\x80\xB2)/$1$2$3/g; # repair broken DNA sequence + $s =~ s/ (etc) \. / $1. /g; # repair most egrareous separations + print "Point W: $s\n" if $local_verbose; + $s = $caller->repair_separated_periods($s); + print "Point X: $s\n" if $local_verbose; + $s =~ s/^\s+//; + $s =~ s/\s+$//; + $s = "$pre$s$post" if defined($pre) && defined($post); + $s =~ s/ +/ /g; + print "Point Y: $s\n" if $local_verbose; + + return $s; +} + +sub tokenize_plus_for_noisy_text { + local($caller, $s, *ht, $control) = @_; + + $control = "" unless defined($control); + my $pre; + my $code; + my $post; + $s = " $core " if ($pre,$core,$post) = ($s =~ /^(\s*)(.*?)(\s*)$/i); + foreach $i ((1 .. 2)) { + $s =~ s/ ([A-Z][a-z]+'?[a-z]+)(-) / $1 $2 /gi; # Example: Beijing- + $s =~ s/ (\d+(?:\.\d+)?)(-|:-|:|_|\.|'|;)([A-Z][a-z]+'?[a-z]+|[A-Z]{3,}) / $1 $2 $3 /gi; # Example: 3:-Maxkamado + $s =~ s/ (\d+(?:\.\d+)?)(')([A-Za-z]{3,}) / $1 $2 $3 /gi; # Example: 42'daqiiqo + $s =~ s/ (-|:-|:|_|\.)([A-Z][a-z]+'?[a-z]+|[A-Z]{3,}) / $1 $2 /gi; # Example: -Xassan + $s =~ s/ ((?:[A-Z]\.[A-Z]|[A-Z]|Amb|Col|Dr|Eng|Gen|Inj|Lt|Maj|Md|Miss|Mr|Mrs|Ms|Pres|Prof|Sen)\.)([A-Z][a-z]+|[A-Z]{2,}) / $1 $2 /gi; # Example: Dr.Smith + $s =~ s/ (\d+)(,)([a-z]{3,}) / $1 $2 $3 /gi; # Example: 24,October + $s =~ s/ (%)(\d+(?:\.\d+)?) / $1 $2 /gi; # Example: %0.6 + $s =~ s/ ([A-Za-z][a-z]{3,}\d*)([.,\/]|:\()([A-Za-z][a-z]{3,}|[A-Z]{3,}) / $1 $2 $3 /gi; # Example: Windows8,falanqeeyaal + $s =~ s/ ([A-Za-z]{3,}\d*?|[A-Za-z]+'[A-Za-z]+)([,\/]|:\()([A-Za-z]{3,}|[A-Za-z]+'[A-Za-z]+) / $1 $2 $3 /gi; # Example: GAROOWE:(SHL + $s =~ s/ (\d[0-9.,]*\d)(;)([a-z]+) / $1 $2 $3 /gi; # Example: 2.1.2014;Waraka + } + $s =~ s/^\s+//; + $s =~ s/\s+$//; + $s = "$pre$s$post" if defined($pre) && defined($post); + return $s; +} + +# preparation for sub repair_separated_periods: + +my $abbrev_s = "etc.|e.g.|i.e.|U.K.|S.p.A.|A.F.P."; +my @abbrevs = split(/\|/, $abbrev_s); +my @exp_abbrevs = (); +foreach $abbrev (@abbrevs) { + if (($core,$period) = ($abbrev =~ /^(.*?)(\.|)$/)) { + $core =~ s/\./\\s*\\.\\s*/g; + $abbrev = $core; + $abbrev .= "\\b" if $abbrev =~ /[a-z]$/i; # don't split etcetera -> etc. etera + $abbrev .= "(?:\\s*\\.|)" if $period; + push(@exp_abbrevs, $abbrev); + } +} +my $exp_abbrev_s = join("|", @exp_abbrevs); + +sub repair_separated_periods { + local($caller,$s) = @_; + + # separated or missing period + my $result = ""; + while (($pre, $abbrev, $post) = ($s =~ /^(.*? |)($exp_abbrev_s)(.*)$/)) { + $abbrev =~ s/ //g; + $abbrev .= "." unless $abbrev =~ /\.$/; + $result .= "$pre$abbrev "; + $s = $post; + } + $result .= $s; + $result =~ s/ +/ /g; + return $result; +} + +# provided by Alex Fraser +sub fix_tokenize { + local($caller,$s) = @_; + + ## change "2:15" to "2 @:@ 15" + $s =~ s/(\d)\:(\d)/$1 \@:\@ $2/g; + + ## strip leading zeros from numbers + $s =~ s/(^|\s)0+(\d)/$1$2/g; + + ## fix rule typo + $s =~ s/associatedpress/associated press/g; + + ## fix _ entities + $s =~ s/hong_kong/hong kong/g; + $s =~ s/united_states/united states/g; + + return $s; +} + +sub de_mt_tokenize { + local($caller,$s) = @_; + + $s =~ s/\s+\@([-:\/])/$1/g; + $s =~ s/([-:\/])\@\s+/$1/g; + $s =~ s/\s+\/\s+/\//g; + return $s; +} + +sub surface_forms { + local($caller,$pe,$modp) = @_; + + $sem = $pe->sem; + $surf = $pe->surf; + $synt = $pe->synt; + $value = $pe->value; + $gloss = $pe->gloss; +# $util->log("surface_forms surf:$surf sem:$sem gloss:$gloss value:$value", $logfile); + if ($sem eq "integer") { + return ($gloss) if ($gloss =~ /several/) && !($value =~ /\S/); + print STDERR "Warning: $value not an integer\n" unless $value =~ /^\d+(e\+\d+)?$/; + if ($pe->get("reliable") =~ /sequence of digits/) { + $english = $value; + $english = "$prefix $english" if $prefix = $pe->get("prefix"); + @result = ($english); + } else { + @result = $caller->q_number_surface_forms($pe); + } + } elsif ($sem eq "decimal number") { + @result = $caller->q_number_surface_forms($pe); + } elsif ($sem =~ /(integer|decimal number) range/) { + @result = $caller->number_range_surface_forms($pe); + } elsif ($sem eq "ordinal") { + if ($pe->get("definite")) { + $exclude_adverbials_p = 1; + } elsif (defined($chinesePM) && ($hao = $chinesePM->e2c("hao-day")) + && ($gc = $chinesePM->e2c("generic counter"))) { + $exclude_adverbials_p = ($surf =~ /($hao|$gc)$/); + } else { + $exclude_adverbials_p = 1; + } + @result = $caller->ordinal_surface_forms($pe->get("ordvalue") || $pe->value,0,$exclude_adverbials_p, $pe); + } elsif ($sem eq "fraction") { + @result = $caller->fraction_surface_forms($pe,$modp); + } elsif ($sem =~ /monetary quantity/) { + @result = $caller->currency_surface_forms($pe); + } elsif ($sem =~ /occurrence quantity/) { + @result = $caller->occurrence_surface_forms($pe,$modp); + } elsif ($sem =~ /score quantity/) { + @result = $caller->score_surface_forms($pe); + } elsif ($sem =~ /age quantity/) { + @result = $caller->age_surface_forms($pe, $modp); + } elsif ($sem =~ /quantity/) { + @result = $caller->quantity_surface_forms($pe,$modp); + } elsif ($sem eq "percentage") { + @result = $caller->percent_surface_forms($pe,$modp); + } elsif ($sem eq "percentage range") { + if ($gloss =~ /^and /) { + @result = ($gloss); + } else { + @result = ($gloss, "by $gloss", "of $gloss"); + } + } elsif ($sem =~ /^(month of the year|month\+year|year)$/) { + if ($synt eq "pp") { + @result = ($gloss); + } elsif ($gloss =~ /^the (beginning|end) of/) { + @result = ($gloss, "at $gloss"); + } elsif ($gloss =~ /^(last|this|current|next)/) { + @result = ($gloss); + } else { + # in November; in mid-November + @result = ($gloss, "in $gloss"); + } + } elsif ($sem =~ /date(\+year)?$/) { + @result = $caller->date_surface_forms($pe,$modp); + } elsif ($sem =~ /year range\b.*\b(decade|century)$/) { + @result = $caller->decade_century_surface_forms($pe); + } elsif ($sem eq "day of the month") { + @result = $caller->day_of_the_month_surface_forms($pe); + } elsif ($sem =~ /period of the day\+day of the week/) { + @result = ($gloss); + push(@result, "on $gloss") if $gloss =~ /^the night/; + } elsif ($sem =~ /day of the week/) { + @result = $caller->day_of_the_week_surface_forms($pe); + } elsif ($sem =~ /^(time)$/) { + if ($gloss =~ /^at /) { + @result = ($gloss); + } else { + @result = ($gloss, "at $gloss"); + } + } elsif ($sem =~ /^date range$/) { + if ($synt eq "pp") { + @result = ($gloss); + } elsif ($pe->get("between")) { + $b_gloss = "between $gloss"; + $b_gloss =~ s/-/ and /; + @result = ($b_gloss, $gloss, "from $gloss"); + } else { + @result = ($gloss, "from $gloss"); + } + } elsif ($sem =~ /^date enumeration$/) { + if ($synt eq "pp") { + @result = ($gloss); + } else { + @result = ($gloss, "on $gloss"); + } + } elsif ($pe->get("unknown-in-pc")) { + @result = (); + foreach $unknown_pos_en (split(/;;/, $pe->get("unknown-pos-en-list"))) { + ($engl) = ($unknown_pos_en =~ /^[^:]+:[^:]+:(.*)$/); + push(@result, $engl) if defined($engl) && ! $util->member($engl, @result); + } + @result = ($gloss) unless @result; + } elsif (($sem =~ /\b(name|unknown)\b/) && (($en_s = $pe->get("english")) =~ /[a-z]/i)) { + @result = split(/\s*\|\s*/, $en_s); + } elsif (($sem =~ /^proper\b/) && (($en_s = $pe->get("english")) =~ /[a-z]/i)) { + @result = split(/\s*\|\s*/, $en_s); + } else { + @result = ($gloss); + } + + if (($sem =~ /^(date\+year|month\+year|year)$/) + && ($year = $pe->get("year")) + && ($year =~ /^\d\d$/) + && (@extend_years = @{$english_entity_style_ht{"ExtendYears"}}) + && ($#extend_years == 1) + && ($extended_year_start = $extend_years[0]) + && ($extended_year_end = $extend_years[1]) + && ($extended_year_start <= $extended_year_end) + && ($extended_year_start + 99 >= $extended_year_end) + && ($extended_year_start =~ /^\d\d\d\d$/) + && ($extended_year_end =~ /^\d\d\d\d$/)) { + $century1 = substr($extended_year_start, 0, 2); + $century2 = substr($extended_year_end, 0, 2); + $exp_year1 = "$century1$year"; + $exp_year2 = "$century2$year"; + if (($extended_year_start <= $exp_year1) && ($exp_year1 <= $extended_year_end)) { + $exp_year = $exp_year1; + } elsif (($extended_year_start <= $exp_year2) && ($exp_year2 <= $extended_year_end)) { + $exp_year = $exp_year2; + } else { + $exp_year = ""; + } + if ($exp_year) { + @new_glosses = (); + foreach $old_gloss (@result) { + $new_gloss = $old_gloss; + $new_gloss =~ s/\b$year$/$exp_year/; + push (@new_glosses, $new_gloss) unless $new_gloss eq $old_gloss; + } + push (@result, @new_glosses); + } + } + + # tokenize as requested + @tokenize_list = @{$english_entity_style_ht{"Tokenize"}}; + $tokenize_p = 1 if $util->member("yes", @tokenize_list) + || $util->member("all", @tokenize_list); + $dont_tokenize_p = 1 if $util->member("no", @tokenize_list) + || $util->member("all", @tokenize_list); + if ($tokenize_p) { + @new_result = (); + foreach $item (@result) { + $t_item = $caller->tokenize($item, *dummy_ht); + push(@new_result, $item) if $dont_tokenize_p && ($item ne $t_item); + push(@new_result, $t_item); + } + @result = @new_result; + } + + # case as requested + @case_list = @{$english_entity_style_ht{"Case"}}; + $lower_case_p = $util->member("lower", @case_list) + || $util->member("all", @case_list); + $reg_case_p = $util->member("regular", @case_list) + || $util->member("all", @case_list); + if ($lower_case_p) { + @new_result = (); + foreach $item (@result) { + $l_item = "\L$item"; + push(@new_result, $item) if $reg_case_p && ($item ne $l_item); + push(@new_result, $l_item) unless $util->member($l_item, @new_result); + } + @result = @new_result; + } + # $value = "n/a" unless $value; + # print STDERR "SF surf:$surf sem:$sem gloss:$gloss value:$value Result(s): " . join("; ", @result) . "\n"; + return @result; +} + +sub case_list { + return @{$english_entity_style_ht{"Case"}}; +} + +sub right_cased_list { + local($caller, $word) = @_; + + @case_list = @{$english_entity_style_ht{"Case"}}; + + @right_cased_core_list = (); + push(@right_cased_core_list, $word) + if ($util->member("regular", @case_list) || $util->member("all", @case_list)) + && ! $util->member($word, @right_cased_core_list); + push(@right_cased_core_list, lc $word) + if ($util->member("lower", @case_list) || $util->member("all", @case_list)) + && ! $util->member(lc $word, @right_cased_core_list); + + return @right_cased_core_list; +} + +sub string2surf_forms { + local($caller, $text, $lang, $alt_sep) = @_; + + $alt_sep = " | " unless defined($alt_sep); + $lang = "zh" unless defined($lang); + + if ($lang eq "zh") { + @pes = $chinesePM->parse_entities_in_string($text); + $n = $#pes + 1; +# print " $n pes\n"; + @pes = $chinesePM->select_reliable_entities(@pes); + my @res_surf_forms_copy = $caller->reliable_pes2surf_forms($alt_sep, @pes); + return @res_surf_forms_copy; + } else { + return (); + } +} + +sub reliable_pe2surf_forms { + local($caller, $pe, $parent_reliant_suffices_p) = @_; + + $parent_reliant_suffices_p = 0 unless defined($parent_reliant_suffices_p); + if ((defined($r = $pe->get("reliable")) && $r) + || ($parent_reliant_suffices_p && ($parent_pe = $pe->get("parent")) && + $parent_pe->get("reliable"))) { + @surf_forms = $caller->surface_forms($pe); + if ((($pe->sem =~ /quantity( range)?$/) && !($pe->sem =~ /monetary quantity/)) + || ($util->member($pe->sem, "percentage","fraction"))) { + foreach $mod_form ($caller->surface_forms($pe, 1)) { + push(@surf_forms, $mod_form) unless $util->member($mod_form, @surf_forms); + } + } + return @surf_forms; + } + return (); +} + +sub reliable_pe2surf_form { + local($caller, $alt_sep, $pe) = @_; + + if (@surf_forms = $caller->reliable_pe2surf_forms($pe)) { + return $pe->surf . " == " . join($alt_sep, @surf_forms); + } else { + return ""; + } +} + +sub reliable_pes2surf_forms { + local($caller, $alt_sep, @pes) = @_; + + my @res_surf_forms = (); + foreach $pe (@pes) { + if ($new_surf_form = $caller->reliable_pe2surf_form($alt_sep, $pe)) { + push(@res_surf_forms, $new_surf_form); + } + } + return @res_surf_forms; +} + +sub string_contains_ascii_letter { + local($caller,$string) = @_; + return $string =~ /[a-zA-Z]/; +} + +sub string_starts_w_ascii_letter { + local($caller,$string) = @_; + return $string =~ /^[a-zA-Z]/; +} + +sub en_lex_bin { + local($caller, $word) = @_; + + $word =~ s/\s+//g; + $word =~ s/[-_'\/]//g; + $word =~ tr/A-Z/a-z/; + return "digit" if $word =~ /^\d/; + return "special" unless $word =~ /^[a-z]/; + return substr($word, 0, 1); +} + +sub skeleton_bin { + local($caller, $sk_bin_control, $word) = @_; + + $word =~ s/\s+//g; + $word =~ s/[-_'\/]//g; + $word =~ tr/A-Z/a-z/; + return "E" unless $word; + if ($sk_bin_control =~ /^v1/i) { + return $word if length($word) <= 2; + return substr($word, 0, 3) if $word =~ /^(b|f[lnrt]|gr|j[nr]|k|l[nt]|m|n[kmst]|r[knst]|s|t)/; + return substr($word, 0, 2); + } elsif ($sk_bin_control =~ /d6f$/) { + return $word if length($word) <= 6; + return substr($word, 0, 6); + } elsif ($sk_bin_control =~ /d5f$/) { + return $word if length($word) <= 5; + return substr($word, 0, 5); + } elsif ($sk_bin_control =~ /d4f$/) { + return $word if length($word) <= 4; + return substr($word, 0, 4); + } else { + return $word if length($word) <= 4; + return substr($word, 0, 5) if $word =~ /^(bnts|brnt|brst|brtk|brtn|brts|frst|frts|klts|kntr|knts|krst|krtn|krts|ksks|kstr|lktr|ntrs|sbrt|skrt|sntr|strn|strt|trns|trts|ts)/; + return substr($word, 0, 4); + } +} + +sub skeleton_bin_sub_dir { + local($caller, $sk_bin_control, $skeleton_bin) = @_; + + $sk_bin_control = "v1" unless defined($sk_bin_control); + return "" if $sk_bin_control =~ /^v1/i; + if ($sk_bin_control =~ /^2d4d\df$/) { + return "SH/SHOR" if (length($skeleton_bin) < 2); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 2) . "SH" if (length($skeleton_bin) < 4); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 4); + } elsif ($sk_bin_control =~ /^2d3d\df$/) { + return "SH/SHO" if (length($skeleton_bin) < 2); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 2) . "S" if (length($skeleton_bin) < 3); + return substr($skeleton_bin, 0, 2) . "/" . substr($skeleton_bin, 0, 3); + } + $bin3 = "ts"; + return "SH" if (length($skeleton_bin) < 2) || ($skeleton_bin =~ /^($bin3)$/); + return substr($skeleton_bin, 0, 3) if $skeleton_bin =~ /^($bin3)/; + return substr($skeleton_bin, 0, 2); +} + +sub en_words_and_counts_matching_skeletons { + local($caller, $sk_bin_version, @skeletons) = @_; + + return () unless @skeletons; + + @rem_skeletons = sort @skeletons; + $previous_skeleton = ""; + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + @lists = (); + + $current_bin = ""; + while ($current_skeleton) { + unless ($current_skeleton eq $previous_skeleton) { + $current_skeleton_bin = $caller->skeleton_bin($sk_bin_version, $current_skeleton); + unless ($current_skeleton_bin eq $current_bin) { + # need to read from new file + close(IN) if $current_bin; + $current_bin = $current_skeleton_bin; + $current_bin_subdir + = $caller->skeleton_bin_sub_dir($sk_bin_version, $current_bin); + if ($current_bin_subdir) { + $en_skeleton_file = File::Spec->catfile($english_resources_skeleton_dir, + $current_bin_subdir, + "$current_bin.txt"); + } else { + $en_skeleton_file = File::Spec->catfile($english_resources_skeleton_dir, + "$current_bin.txt"); + } + # print STDERR " Perusing $en_skeleton_file ...\n"; + if (open(IN, $en_skeleton_file)) { + $en_skeleton_file_exists = 1; + } else { + $en_skeleton_file_exists = 0; + print STDERR "Can't open $en_skeleton_file (Point A)\n"; + } + } + $previous_skeleton = $current_skeleton; + } + $_ = <IN> if $en_skeleton_file_exists; + unless ($en_skeleton_file_exists && defined($_)) { + push(@lists, join(' ; ', @list)); + if (@rem_skeletons) { + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + } else { + $current_skeleton = ""; + } + next; + } + ($skeleton) = ($_ =~ /^(\S+)\t/); + next unless defined($skeleton); + $skeletons_match_p = $caller->skeletons_match_p($skeleton, $current_skeleton); + next if ($skeleton lt $current_skeleton) && ! $skeletons_match_p; + if ($skeletons_match_p) { + ($token, $count) = ($_ =~ /^\S+\t(\S|\S[-' a-zA-Z]*\S)\t(\d+)\s*$/); + push(@list, "$token : $count") if defined($token) && defined($count); + } else { + while ($current_skeleton lt $skeleton) { + push(@lists, join(' ; ', @list)); + unless (@rem_skeletons) { + close(IN) if $current_bin; + $current_skeleton = ""; + last; + } + $current_skeleton = shift @rem_skeletons; + @list = ($current_skeleton); + } + if ($caller->skeletons_match_p($skeleton, $current_skeleton)) { + ($token, $count) = ($_ =~ /^\S+\t(\S|\S[-' a-zA-Z]*\S)\t(\d+)\s*$/); + push(@list, "$token : $count") if defined($token) && defined($count); + } + } + } + close(IN) if $current_bin; + return @lists; +} + +sub skeletons_match_p { +# one of the skeletons might have been cut off at max + local($caller, $skeleton1, $skeleton2, $max) = @_; + + return 1 if $skeleton1 eq $skeleton2; + + $max = 5 unless defined($max); + if ((length($skeleton1) > length($skeleton2)) && (length($skeleton2) == $max)) { + return ($skeleton1 =~ /^$skeleton2/) ? 1 : 0; + } elsif ((length($skeleton2) > length($skeleton1)) && (length($skeleton1) == $max)) { + return ($skeleton2 =~ /^$skeleton1/) ? 1 : 0; + } else { + return 0; + } +} + +sub token_weird_or_too_long { + local($caller, *WARNING_FH, $token) = @_; + + $lc_token = lc $token; + $norm_token = $lc_token; + $norm_token =~ s/[-' ,]//g; + $snippet4_5 = ""; + $snippet4_5 = substr($norm_token, 4, 2) if length($norm_token) >= 10; + $snippet4_6 = ""; + $snippet4_6 = substr($norm_token, 4, 3) if length($norm_token) >= 10; + if (($norm_token =~ /(kkk|vvv|www|xxx|yyy|zzz)/) || + ($norm_token =~ /[acgt]{15,}/) || # DNA sequence + ($snippet4_5 && ($norm_token =~ /($snippet4_5){5,}/)) || # 2-letter repetition + ($snippet4_6 && ($norm_token =~ /($snippet4_6){4,}/)) || # 3-letter repetition + ($norm_token =~ /[bcdfghjklmnpqrstvwxz]{8,}/) || # too many consonants + ($token =~ /(DDD)/) || + (($lc_token =~ /fff/) && ! ($lc_token =~ /schifff/))) { + print WARNING_FH "skipping (WEIRD): $_"; + return 1; + } + if ((length($norm_token) >= 50) || + ((length($norm_token) >= 28) + + # typical German compound noun components + && ! ($norm_token =~ /entwicklung/) + && ! ($norm_token =~ /fabrik/) + && ! ($norm_token =~ /finanz/) + && ! ($norm_token =~ /forschung/) + && ! ($norm_token =~ /geschwindigkeit/) + && ! ($norm_token =~ /gesundheit/) + && ! ($norm_token =~ /gewohnheit/) + && ! ($norm_token =~ /schaft/) + && ! ($norm_token =~ /schifffahrt/) + && ! ($norm_token =~ /sicherheit/) + && ! ($norm_token =~ /vergangen/) + && ! ($norm_token =~ /versicherung/) + && ! ($norm_token =~ /unternehmen/) + && ! ($norm_token =~ /verwaltung/) + + # Other Germanic languages + && ! ($norm_token =~ /aktiebolag/) + && ! ($norm_token =~ /aktieselskab/) + && ! ($norm_token =~ /ontwikkeling/) + + # chemical + && ! ($norm_token =~ /phetamine/) + && ! ($norm_token =~ /ethyl/) + + # medical + && ! ($norm_token =~ /^pneumonaultramicroscopicsilicovolcanoconios[ei]s$/) + + # business + && ! ($norm_token =~ /PriceWaterhouse/) + )) { + print WARNING_FH "skipping (TOO LONG): $_"; + return 1; + } + return 0; +} + +sub xml_de_accent { + local($caller, $string) = @_; + + # for the time being, unlauts are mapped to main vowel (without "e") + + $string =~ s/\[2-7];/A/g; + $string =~ s/\Æ/Ae/g; + $string =~ s/\Ç/C/g; + $string =~ s/\[0-3];/E/g; + $string =~ s/\[4-7];/I/g; + $string =~ s/\Ð/Dh/g; + $string =~ s/\Ñ/N/g; + $string =~ s/\[0-4];/O/g; + $string =~ s/\Ø/O/g; + $string =~ s/\[7-9];/U/g; + $string =~ s/\Ü/U/g; + $string =~ s/\Ý/Y/g; + $string =~ s/\Þ/Th/g; + + $string =~ s/\ß/ss/g; + $string =~ s/\[4-9];/a/g; + $string =~ s/\æ/ae/g; + $string =~ s/\ç/c/g; + $string =~ s/\[2-5];/e/g; + $string =~ s/\[6-9];/i/g; + $string =~ s/\ð/dh/g; + $string =~ s/\ñ/n/g; + $string =~ s/\[2-6];/o/g; + $string =~ s/\ø/o/g; + $string =~ s/\ù/u/g; + $string =~ s/\[0-2];/u/g; + $string =~ s/\ý/y/g; + $string =~ s/\þ/th/g; + $string =~ s/\ÿ/y/g; + $string =~ s/\xE2\x80\x99/'/g; + + return $string; +} + +sub de_accent { + local($caller, $string) = @_; + + # for the time being, unlauts are mapped to main vowel (without "e") + + $string =~ s/\xC3[\x80-\x85]/A/g; + $string =~ s/\xC3\x86/Ae/g; + $string =~ s/\xC3\x87/C/g; + $string =~ s/\xC3[\x88-\x8B]/E/g; + $string =~ s/\xC3[\x8C-\x8F]/I/g; + $string =~ s/\xC3\x90/Dh/g; + $string =~ s/\xC3\x91/N/g; + $string =~ s/\xC3[\x92-\x96]/O/g; + $string =~ s/\xC3\x98/O/g; + $string =~ s/\xC3[\x99-\x9C]/U/g; + $string =~ s/\xC3\x9D/Y/g; + $string =~ s/\xC3\x9E/Th/g; + + $string =~ s/\xC3\x9F/ss/g; + $string =~ s/\xC3[\xA0-\xA5]/a/g; + $string =~ s/\xC3\xA6/ae/g; + $string =~ s/\xC3\xA7/c/g; + $string =~ s/\xC3[\xA8-\xAB]/e/g; + $string =~ s/\xC3[\xAC-\xAF]/i/g; + $string =~ s/\xC3\xB0/dh/g; + $string =~ s/\xC3\xB1/n/g; + $string =~ s/\xC3[\xB2-\xB6]/o/g; + $string =~ s/\xC3\xB8/o/g; + $string =~ s/\xC3[\xB9-\xBC]/u/g; + $string =~ s/\xC3\xBD/y/g; + $string =~ s/\xC3\xBE/th/g; + $string =~ s/\xC3\xBF/y/g; + $string =~ s/\xE2\x80\x99/'/g; + + return $string; +} + +sub common_non_name_cap_p { + local($caller, $word) = @_; + return defined($english_ht{(lc $word)}->{COMMON_NON_NAME_CAP}); +} + +sub language { + return "English"; +} + +sub language_id { + return "en"; +} + +sub parse_entities_in_string { + local($caller, $string) = @_; + + $ParseEntry->set_current_lang("en"); + @pes = $ParseEntry->init_ParseEntry_list($string); + @pes = $caller->lexical_heuristic(@pes); + @pes = $caller->base_number_heuristic(@pes); + + return @pes; +} + +sub lexical_heuristic { + local($caller, @pes) = @_; + + $i = 0; + while ($i <= $#pes) { + $pe = $pes[$i]; + if ($pe->undefined("synt")) { + if ($pe->surf =~ /^\d+(,\d\d\d)*\.\d+/) { + $pe->set("synt", "cardinal"); + $pe->set("sem", "decimal number"); + $value = $pe->surf; + $value =~ s/,//g; + $pe->set("value", $value); + } elsif ($pe->surf =~ /^\d+(,\d\d\d)*$/) { + $pe->set("synt", "cardinal"); + $pe->set("sem", "integer"); + $value = $pe->surf; + $value =~ s/,//g; + $pe->set("value", $value); + } elsif ($pe->surf =~ /^([-",\.;\s:()\/%]|\@[-:\/]\@|[-:\/]\@|\@[-:\/])$/) { + $pe->set("gloss", $pe->surf); + $pe->set("synt", "punctuation"); + } else { + ($length,$english) = $caller->find_max_lex_match($i,3,@pes); + if ($length) { + if ($length > 1) { + @slot_value_list = (); + @children = splice(@pes,$i,$length); + @roles = $util->list_with_same_elem($length,"lex"); + $pe = $ParseEntry->newParent(*slot_value_list,*children,*roles); + $pe->set("surf",$english); + $pe->set("eot",1) if $pe->eot_p; + splice(@pes,$i,0,$pe); + } else { + $pe = $pes[$i]; + } + $annot_s = $english_annotation_ht{$english}; + $annot_s =~ s/^\s*:+//; + $annot_s =~ s/^\s+//; + $annot_s =~ s/\s+$//; + $annot_s =~ s/#.*$//; + foreach $annot (split('::', $annot_s)) { + ($slot, $value) = ($annot =~ /^([^:]+):(.*)$/); + if (defined($slot) && defined($value)) { + $pe->set($slot, $value); + } + $pe->set("sem", "integer") if ($slot eq "synt") && ($value eq "cardinal"); + } + $pe->set("ord-value", $ord_value) + if $ord_value = $english_annotation_ht{"_EN_SYNT_"}->{(lc $english)}->{"ordinal"}->{"value"}; + $pe->set("card-value", $card_value) + if $card_value = $english_annotation_ht{"_EN_SYNT_"}->{(lc $english)}->{"cardinal"}->{"value"}; + } + } + } + $i++; + } + return @pes; +} + +# builds numbers, incl. integers, decimal numbers, fractions, percentages, ordinals +sub base_number_heuristic { + local($caller, @pes) = @_; + + $i = 0; + # $ParseEntry->print_pes("start base_number_heuristic",$i,@pes); + while ($i <= $#pes) { + # forty-five + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite number plus","",$i,*pes, + ' :head :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]0$/)', + 'optional:dummy:$pe->surf eq "\@-\@"', + ' :mod :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]$/)'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childValue("head"); + $value2 = $head_pe->childValue("mod"); + $head_pe->set("value", $value1 + $value2); + } + # six billion + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite number 1000","",$i,*pes, + ' :mod :(($value1 = $pe->value) =~ /^\d+(.\d+)?$/) && ($value1 < 1000)', + ' :head:($value2 = $pe->value) =~ /^1(000)+$/'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childValue("mod"); + $value2 = $head_pe->childValue("head"); + $head_pe->set("value", $value1 * $value2); + } + # twenty-second + ($head_pe, @pes) = + $ParseEntry->build_parse_entry("composite ordinal","",$i,*pes, + ' :mod :($pe->sem eq "integer") && ($pe->value =~ /^[1-9]0$/)', + 'optional:dummy:$pe->surf eq "\@-\@"', + ' :head :$pe->get("ord-value") =~ /^[1-9]$/'); + if ($head_pe) { # match succeeded + $value1 = $head_pe->childSlot("head", "ord-value"); + $value2 = $head_pe->childValue("mod"); + $head_pe->set("value", $value1 + $value2); + } + $i++; + } + + return @pes; +} + +sub find_max_lex_match { + local($caller,$start,$maxlength,@pes) = @_; + + while ($maxlength > 0) { + if (($english = $util->pes_subseq_surf($start,$maxlength,"en",@pes)) + && defined($english_annotation_ht{$english}) + && ($english =~ /\S/)) { + return ($maxlength, $english); + } else { + $maxlength--; + } + } + return (0,""); +} + +sub select_reliable_entities { + local($caller, @pes) = @_; + + foreach $i (0 .. $#pes) { + $pe = $pes[$i]; + $surf = $pe->surf; + + $pe->set("reliable",1); + } + return @pes; +} + +sub negatives_p { + # (cool <-> uncool), (improper <-> proper), ... + local($caller, $s1, $s2) = @_; + + my $g_s1 = $util->regex_guard($s1); + my $g_s2 = $util->regex_guard($s2); + return 1 if $s1 =~ /^[iu]n$g_s2$/; + return 1 if $s1 =~ /^il$g_s2$/ && ($s2 =~ /^l/); + return 1 if $s1 =~ /^im$g_s2$/ && ($s2 =~ /^[mp]/); + + return 1 if $s2 =~ /^[iu]n$g_s1$/; + return 1 if $s2 =~ /^il$g_s1$/ && ($s1 =~ /^l/); + return 1 if $s2 =~ /^im$g_s1$/ && ($s1 =~ /^[mp]/); + + return 0; +} + +sub present_participle_p { + local($caller, $pe) = @_; + + my $aux_pe = $pe->child("aux"); + return $caller->present_participle_p($aux_pe) if $aux_pe; + my $head_pe = $pe->child("head"); + return $caller->present_participle_p($head_pe) if $head_pe; + return ($pe->synt =~ /^VBG/); +} + + +%engl_value_ht = ( + "monday" => 1, + "tuesday" => 2, + "wednesday" => 3, + "thursday" => 4, + "friday" => 5, + "saturday" => 6, + "sunday" => 7, + + "january" => 1, + "february" => 2, + "march" => 3, + "april" => 4, + "may" => 5, + "june" => 6, + "july" => 7, + "august" => 8, + "september" => 9, + "october" => 10, + "november" => 11, + "december" => 12, + + "spring" => 1, + "summer" => 2, + "fall" => 3, + "autumn" => 3, + "winter" => 4, + + "morning" => 1, + "noon" => 2, + "afternoon" => 3, + "evening" => 4, + "night" => 5, + + "picosecond" => 1, + "nanosecond" => 2, + "microsecond" => 3, + "millisecond" => 4, + "second" => 5, + "minute" => 6, + "hour" => 7, + "day" => 8, + "week" => 9, + "fortnight" => 10, + "month" => 11, + "year" => 12, + "decade" => 13, + "century" => 14, + "millennium" => 15, + + "nanometer" => 2, + "micrometer" => 3, + "millimeter" => 4, + "centimeter" => 5, + "decimeter" => 6, + "meter" => 7, + "kilometer" => 8, + "inch" => 11, + "foot" => 12, + "yard" => 13, + "mile" => 14, + "lightyear" => 20, + + "microgram" => 2, + "milligram" => 3, + "gram" => 4, + "kilogram" => 5, + "ton" => 6, + "ounce" => 14, +); + +sub engl_order_value { + local($this, $s) = @_; + + return $value = $engl_value_ht{(lc $s)} || 0; +} + +1; + diff --git a/v1.3.7/lib/NLP/SntSegmenter.pm b/v1.3.7/lib/NLP/SntSegmenter.pm new file mode 100755 index 0000000..9ae01af --- /dev/null +++ b/v1.3.7/lib/NLP/SntSegmenter.pm @@ -0,0 +1,98 @@ +################################################################ +# # +# SntSegmenter # +# # +################################################################ + +package NLP::SntSegmenter; + +# use NLP::UTF8; +# $util = NLP::utilities; + +sub segment { + local($this, $s, $lang_code) = @_; + + $s =~ s/\n/ /g; + $s =~ s/\s*$//; + return $s if $s =~ /^\s*<sec-title.*<\/sec-title>\s*$/; + # print STDERR "SEGMENT $s\n" if $s =~ /\[<xref/; #] + $reg_tag = "(?:<[^<>]*>)?"; + $lang_code = "en" unless defined($lang_code); + + # protect certain ". " by inserting \x01; + # Sen. Feinstein + foreach $_ ((1 .. 2)) { + $s =~ s/\b((?:Adm|Amb|Brig|Capt|Co|Col|Cpt|Dr|Eng|Fr|Gen|Gov|Hon|Ing|Lt|Maj|Mr|Mrs|Ms|Mt|Pfc|Pres|Pr|Prof|Rep|Rev|Sen|Sens|Sgt|Spt|St|Sup|Supt)\.)(\s+[A-Z][a-z]|\s+[A-Z]\.\s+[A-Z][a-z])/$1\x01$2/g; + } + $s =~ s/(Sgt\.)(\s+1st Class)/$1\x01$2/g; + $s =~ s/\b((?:Jan|Feb|Febr|Apr|Aug|Sep|Sept|Oct|Nov|Dec)\.)(\s+[1-9])/$1\x01$2/g; + $s =~ s/\b((?:No)\.)(\s+[1-9])/$1\x01$2/g; + # J.F. Kennedy + $s =~ s/((?:[ ()])(?:[A-Z]\.)+)(\s+(?:[A-Z]\.)+)(\s+[A-Z][a-z])/$1\x01$2\x01$3/g; # George H. + W. Bush. + $s =~ s/^([A-Z]\.)(\s+[A-Z][a-z])/$1\x01$2/g; + $s =~ s/((?:[ ()])(?:[A-Z]\.)+)(\s+[A-Z][a-z])/$1\x01$2/g; + $s =~ s/\b([A-Z]{4,} (?:[A-Z]\.)+)(\s+[A-Z]{4,})/$1\x01$2/g; # MARTIN L.C. FELDMAN + $s =~ s/^((?:[A-Z]\.)+)(\s+[A-Z]{4,})/$1\x01$2/g; # F. DENNIS + $s =~ s/([A-Z]\S*\.)(\s(?:Jr|Sr)\.\s*)$/$1\x01$2/g; # McNeil. Jr. + # et al. (2015) + $s =~ s/(\bet al$reg_tag\.$reg_tag)(\s+\(\d+\))/$1\x01$2/g; + $s =~ s/(\bet al$reg_tag\.$reg_tag)(\s+(?:19|20)\d\d)/$1\x01$2/g; + $s =~ s/(\bet al$reg_tag\.$reg_tag)(\s+[a-z])/$1\x01$2/g; + $s =~ s/(\bet al$reg_tag\.$reg_tag)(\s+\[<xref)/$1\x01$2/g; + $s =~ s/(\bet\.)(\s+al$reg_tag\.?$reg_tag\s+[a-z])/$1\x01$2/g; # bad Latin even in journals + $s =~ s/(ref$reg_tag\.$reg_tag)(\s+\[<xref)/$1\x01$2/g; + $s =~ s/((?:vs|i\.e)$reg_tag\.$reg_tag)(\s+\S)/$1\x01$2/g; + $s =~ s/\b(Fig\.)(\s+$reg_tag[0-9A-Z])/$1\x01$2/g; + $s =~ s/\b([A-Z]\.)(\s+[a-z])/$1\x01$2/g; + $s =~ s/((?:Ph\.D|M\.D)$reg_tag\.$reg_tag)(\s+\(.{1,80}\))/$1\x01$2/g; + $s =~ s/\b((?:Co|Corp|Inc|Jr|min|resp)\.)(\s+[a-z])/$1\x01$2/g; + $s =~ s/\b(E\.)(\s+coli)/$1\x01$2/g; + $s =~ s/\b(S\.)(\s+cerevisiae)/$1\x01$2/g; + $s =~ s/\b(e\.g\.|i\.e\.)(\s+[a-zA-Z])/$1\x01$2/g; + $s =~ s/\b([a-z]\.[a-z]\.$reg_tag)(\s+(?:[a-z()]|\xE2\xA9|\xE2\x89))/$1\x01$2/g; # s.c. lower-case + $s =~ s/([a-zA-Z]{2,2}[.!?]\xE2\x80\x9D)(\s+[a-z]+\s)/$1\x01$2/g; # end of embedded quote + $s =~ s/\b(U\.)(\s+S\.)/$1\x01$2/g; # split U. S. + $s =~ s/\b(U\.S\.|S\.C\.)(\s+(?:Supreme Court|Senate|Pres\.|President|Sen?s\.|Senators?|Mexico border|District Judge|Jury|Malls|Killing Spree))/$1\x01$2/ig; # U.S. Supreme Court + $s =~ s/\b(a\.m\.|p\.m\.)(\s(?:EDT|PDT)\b)/$1\x01$2/g; # 3 p.m. EDT + $s =~ s/([A-Z][a-z]+ v\.)(\s[A-Z][a-z])/$1\x01$2/g; # Bowers v. + Hardwick + $s =~ s/(\s[a-z]+\s\.\.\.)(\s(?:and|be|by)\s)/$1\x01$2/g; # enunciated ... by + foreach $i ((1 .. 2)) { + $s =~ s/(\((?:<[^<>]*>|[^()<>]){1,60}\.)(\s(?:<[^<>]*>|[^()<>]){1,60}\))/$1\x01$2/g if $i; # keep modestly short material in parentheses together (...) + } + + # break these after all + $s =~ s/\b(in (?:[Pp]anel|[Tt]able|[Ff]igure) [A-Z]\.)\x01(\s+[A-Z][a-z])/$1$2/g; + $s =~ s/([- >;\x80-\xDF][A-Z]\.)\x01(\s+(?:After|Data|Detection|Each|However|Interestingly|The|Then|This|Total)[, ])/$1$2/g; + $s =~ s/\b(s\.d\.)\x01(\(<bold>[A-Z1-9]<\/bold>\))/$1$2/g; + + $s =~ s/([.!?]) /$1\n/g; + $s =~ s/([.!?])(<\/(?:bold|i)>|"|\xE2\x80\x9D|<sup>(?:[^<>]*|<(?:xref) [^<>]*>[^<>]*<\/[a-z]+>)*<\/sup>) /$1$2\n/g; + $s =~ s/\x01//g; + + # special cases: break + $s =~ s/(\xC2\xB0[CF]\.)\s+/$1\n/g; # degree sign+[CF] + $s =~ s/(Sos-1)\s*(\(B\) Cos-7 cells were)/$1\n$2/g; + $s =~ s/(\.)\s*(To pinpoint)/$1\n$2/g; + $s =~ s/(\.)\s*(Total cellular proteins)/$1\n$2/g; + $s =~ s/(annexin V\.)\s+(Cells)/$1\n$2/g; + $s =~ s/(as in C\.)\s+(Antibodies)/$1\n$2/g; + $s =~ s/(subgroup [A-Z]\.)\s+(Second,)/$1\n$2/g; + $s =~ s/(U\.S\.)\s+(Ramadan)\b/$1\n$2/g; + + # special cases: don't break + $s =~ s/(,\s+cat\.)\s+(no?.\s+\d)/$1 $2/g; + $s =~ s/([.!?])\s+(\(PMID:\d+\)\s*)$/$1 $2/g; # headline (PMID:12345678) + $s =~ s/( models\.)\s+(such as )/$1 $2/g; # probably bad English + $s =~ s/(More recently,)\s+(\S)/$1 $2/g; # bad paragraph break + $s =~ s/(the molecules'expression\.)\s+(and <i>EGFR)/$1 $2/g; # bad English + $s =~ s/(me\?")\s+(the)/$1 $2/g; + + # special cases: glue + $s =~ s/(\xE2\x80\xB2|′|[ACGT]{4,4})(-)\s+([ACGT]{4,4})/$1$2$3/g; # DNA sequences + $s =~ s/(\xE2\x80\xB2|′|[ACGT]{4,4})(-)\s+([ACGT]{4,4})/$1$2$3/g; # DNA sequences + + return $s; +} + +1; + diff --git a/v1.3.7/lib/NLP/UTF8.pm b/v1.3.7/lib/NLP/UTF8.pm new file mode 100755 index 0000000..8488ea7 --- /dev/null +++ b/v1.3.7/lib/NLP/UTF8.pm @@ -0,0 +1,1353 @@ +################################################################ +# # +# UTF8 # +# # +################################################################ + +package NLP::UTF8; + +use NLP::utilities; +$util = NLP::utilities; + +%empty_ht = (); + +sub new { + local($caller) = @_; + + my $object = {}; + my $class = ref( $caller ) || $caller; + bless($object, $class); + return $object; +} + +sub unicode_string2string { +# input: string that might contain unicode sequences such as "U+0627" +# output: string in pure utf-8 + local($caller,$s) = @_; + + my $pre; + my $unicode; + my $post; + my $r1; + my $r2; + my $r3; + + ($pre,$unicode,$post) = ($s =~ /^(.*)(?:U\+|\\u)([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])(.*)$/); + return $s unless defined($post); + $r1 = $caller->unicode_string2string($pre); + $r2 = $caller->unicode_hex_string2string($unicode); + $r3 = $caller->unicode_string2string($post); + $result = $r1 . $r2 . $r3; + return $result; +} + +sub unicode_hex_string2string { +# input: "0627" (interpreted as hex code) +# output: utf-8 string for Arabic letter alef + local($caller,$unicode) = @_; + return "" unless defined($unicode); + my $d = hex($unicode); + return $caller->unicode2string($d); +} + +sub unicode2string { +# input: non-neg integer, e.g. 0x627 +# output: utf-8 string for Arabic letter alef + local($caller,$d) = @_; + return "" unless defined($d) && $d >= 0; + return sprintf("%c",$d) if $d <= 0x7F; + + my $lastbyte1 = ($d & 0x3F) | 0x80; + $d >>= 6; + return sprintf("%c%c",$d | 0xC0, $lastbyte1) if $d <= 0x1F; + + my $lastbyte2 = ($d & 0x3F) | 0x80; + $d >>= 6; + return sprintf("%c%c%c",$d | 0xE0, $lastbyte2, $lastbyte1) if $d <= 0xF; + + my $lastbyte3 = ($d & 0x3F) | 0x80; + $d >>= 6; + return sprintf("%c%c%c%c",$d | 0xF0, $lastbyte3, $lastbyte2, $lastbyte1) if $d <= 0x7; + + my $lastbyte4 = ($d & 0x3F) | 0x80; + $d >>= 6; + return sprintf("%c%c%c%c%c",$d | 0xF8, $lastbyte4, $lastbyte3, $lastbyte2, $lastbyte1) if $d <= 0x3; + + my $lastbyte5 = ($d & 0x3F) | 0x80; + $d >>= 6; + return sprintf("%c%c%c%c%c%c",$d | 0xFC, $lastbyte5, $lastbyte4, $lastbyte3, $lastbyte2, $lastbyte1) if $d <= 0x1; + return ""; # bad input +} + +sub html2utf8 { + local($caller, $string) = @_; + + return $string unless $string =~ /\&\#\d{3,5};/; + + my $prev = ""; + my $s = $string; + while ($s ne $prev) { + $prev = $s; + ($pre,$d,$post) = ($s =~ /^(.*)\&\#(\d+);(.*)$/); + if (defined($d) && ((($d >= 160) && ($d <= 255)) + || (($d >= 1500) && ($d <= 1699)) + || (($d >= 19968) && ($d <= 40879)))) { + $html_code = "\&\#" . $d . ";"; + $utf8_code = $caller->unicode2string($d); + $s =~ s/$html_code/$utf8_code/; + } + } + return $s; +} + +sub xhtml2utf8 { + local($caller, $string) = @_; + + return $string unless $string =~ /\&\#x[0-9a-fA-F]{2,5};/; + + my $prev = ""; + my $s = $string; + while ($s ne $prev) { + $prev = $s; + if (($pre, $html_code, $x, $post) = ($s =~ /^(.*)(\&\#x([0-9a-fA-F]{2,5});)(.*)$/)) { + $utf8_code = $caller->unicode_hex_string2string($x); + $s =~ s/$html_code/$utf8_code/; + } + } + return $s; +} + +sub utf8_marker { + return sprintf("%c%c%c\n", 0xEF, 0xBB, 0xBF); +} + +sub enforcer { +# input: string that might not conform to utf-8 +# output: string in pure utf-8, with a few "smart replacements" and possibly "?" + local($caller,$s,$no_repair) = @_; + + my $ascii; + my $utf8; + my $rest; + + return $s if $s =~ /^[\x00-\x7F]*$/; + + $no_repair = 0 unless defined($no_repair); + $orig = $s; + $result = ""; + + while ($s ne "") { + ($ascii,$rest) = ($s =~ /^([\x00-\x7F]+)(.*)$/); + if (defined($ascii)) { + $result .= $ascii; + $s = $rest; + next; + } + ($utf8,$rest) = ($s =~ /^([\xC0-\xDF][\x80-\xBF])(.*)$/); + ($utf8,$rest) = ($s =~ /^([\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)$/) + unless defined($rest); + ($utf8,$rest) = ($s =~ /^([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])(.*)$/) + unless defined($rest); + ($utf8,$rest) = ($s =~ /^([\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])(.*)$/) + unless defined($rest); + if (defined($utf8)) { + $result .= $utf8; + $s = $rest; + next; + } + ($c,$rest) = ($s =~ /^(.)(.*)$/); + if (defined($c)) { + if ($no_repair) { $result .= "?"; } + elsif ($c =~ /\x85/) { $result .= "..."; } + elsif ($c =~ /\x91/) { $result .= "'"; } + elsif ($c =~ /\x92/) { $result .= "'"; } + elsif ($c =~ /\x93/) { $result .= $caller->unicode2string(0x201C); } + elsif ($c =~ /\x94/) { $result .= $caller->unicode2string(0x201D); } + elsif ($c =~ /[\xC0-\xFF]/) { + $c2 = $c; + $c2 =~ tr/[\xC0-\xFF]/[\x80-\xBF]/; + $result .= "\xC3$c2"; + } else { + $result .= "?"; + } + $s = $rest; + next; + } + $s = ""; + } + $result .= "\n" if ($orig =~ /\n$/) && ! ($result =~ /\n$/); + return $result; +} + +sub split_into_utf8_characters { +# input: utf8 string +# output: list of sub-strings, each representing a utf8 character + local($caller,$string,$group_control, *ht) = @_; + + @characters = (); + $end_of_token_p_string = ""; + $skipped_bytes = ""; + $group_control = "" unless defined($group_control); + $group_ascii_numbers = ($group_control =~ /ASCII numbers/); + $group_ascii_spaces = ($group_control =~ /ASCII spaces/); + $group_ascii_punct = ($group_control =~ /ASCII punct/); + $group_ascii_chars = ($group_control =~ /ASCII chars/); + $group_xml_chars = ($group_control =~ /XML chars/); + $group_xml_tags = ($group_control =~ /XML tags/); + $return_only_chars = ($group_control =~ /return only chars/); + $return_trailing_whitespaces = ($group_control =~ /return trailing whitespaces/); + if ($group_control =~ /ASCII all/) { + $group_ascii_numbers = 1; + $group_ascii_spaces = 1; + $group_ascii_chars = 1; + $group_ascii_punct = 1; + } + if ($group_control =~ /(XML chars and tags|XML tags and chars)/) { + $group_xml_chars = 1; + $group_xml_tags = 1; + } + $orig_string = $string; + $string .= " "; + while ($string =~ /\S/) { + # one-character UTF-8 = ASCII + if ($string =~ /^[\x00-\x7F]/) { + if ($group_xml_chars + && (($dec_unicode, $rest) = ($string =~ /^&#(\d+);(.*)$/s)) + && ($utf8_char = $caller->unicode2string($dec_unicode))) { + push(@characters, $utf8_char); + $string = $rest; + } elsif ($group_xml_chars + && (($hex_unicode, $rest) = ($string =~ /^&#x([0-9a-f]{1,6});(.*)$/is)) + && ($utf8_char = $caller->unicode_hex_string2string($hex_unicode))) { + push(@characters, $utf8_char); + $string = $rest; + } elsif ($group_xml_chars + && (($html_entity_name, $rest) = ($string =~ /^&([a-z]{1,6});(.*)$/is)) + && ($dec_unicode = $ht{HTML_ENTITY_NAME_TO_DECUNICODE}->{$html_entity_name}) + && ($utf8_char = $caller->unicode2string($dec_unicode)) + ) { + push(@characters, $utf8_char); + $string = $rest; + } elsif ($group_xml_tags + && (($tag, $rest) = ($string =~ /^(<\/?[a-zA-Z][-_:a-zA-Z0-9]*(\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>)(.*)$/s))) { + push(@characters, $tag); + $string = $rest; + } elsif ($group_ascii_numbers && ($string =~ /^[12]\d\d\d\.[01]?\d.[0-3]?\d([^0-9].*)?$/)) { + ($date) = ($string =~ /^(\d\d\d\d\.\d?\d.\d?\d)([^0-9].*)?$/); + push(@characters,$date); + $string = substr($string, length($date)); + } elsif ($group_ascii_numbers && ($string =~ /^\d/)) { + ($number) = ($string =~ /^(\d+(,\d\d\d)*(\.\d+)?)/); + push(@characters,$number); + $string = substr($string, length($number)); + } elsif ($group_ascii_spaces && ($string =~ /^(\s+)/)) { + ($space) = ($string =~ /^(\s+)/); + $string = substr($string, length($space)); + } elsif ($group_ascii_punct && (($punct_seq) = ($string =~ /^(-+|\.+|[:,%()"])/))) { + push(@characters,$punct_seq); + $string = substr($string, length($punct_seq)); + } elsif ($group_ascii_chars && (($word) = ($string =~ /^(\$[A-Z]*|[A-Z]{1,3}\$)/))) { + push(@characters,$word); + $string = substr($string, length($word)); + } elsif ($group_ascii_chars && (($abbrev) = ($string =~ /^((?:Jan|Feb|Febr|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|Mr|Mrs|Dr|a.m|p.m)\.)/))) { + push(@characters,$abbrev); + $string = substr($string, length($abbrev)); + } elsif ($group_ascii_chars && (($word) = ($string =~ /^(second|minute|hour|day|week|month|year|inch|foot|yard|meter|kilometer|mile)-(?:long|old)/i))) { + push(@characters,$word); + $string = substr($string, length($word)); + } elsif ($group_ascii_chars && (($word) = ($string =~ /^(zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)-/i))) { + push(@characters,$word); + $string = substr($string, length($word)); + } elsif ($group_ascii_chars && (($word) = ($string =~ /^([a-zA-Z]+)(?:[ ,;%?|()"]|'s |' |\. |\d+[:hms][0-9 ])/))) { + push(@characters,$word); + $string = substr($string, length($word)); + } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x27\x2A-\x7E]+)/)) { # exclude () + ($ascii) = ($string =~ /^([\x21-\x27\x2A-\x7E]+)/); # ASCII black-characters + push(@characters,$ascii); + $string = substr($string, length($ascii)); + } elsif ($group_ascii_chars && ($string =~ /^([\x21-\x7E]+)/)) { + ($ascii) = ($string =~ /^([\x21-\x7E]+)/); # ASCII black-characters + push(@characters,$ascii); + $string = substr($string, length($ascii)); + } elsif ($group_ascii_chars && ($string =~ /^([\x00-\x7F]+)/)) { + ($ascii) = ($string =~ /^([\x00-\x7F]+)/); + push(@characters,$ascii); + $string = substr($string, length($ascii)); + } else { + push(@characters,substr($string, 0, 1)); + $string = substr($string, 1); + } + + # two-character UTF-8 + } elsif ($string =~ /^[\xC0-\xDF][\x80-\xBF]/) { + push(@characters,substr($string, 0, 2)); + $string = substr($string, 2); + + # three-character UTF-8 + } elsif ($string =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) { + push(@characters,substr($string, 0, 3)); + $string = substr($string, 3); + + # four-character UTF-8 + } elsif ($string =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) { + push(@characters,substr($string, 0, 4)); + $string = substr($string, 4); + + # five-character UTF-8 + } elsif ($string =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) { + push(@characters,substr($string, 0, 5)); + $string = substr($string, 5); + + # six-character UTF-8 + } elsif ($string =~ /^[\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/) { + push(@characters,substr($string, 0, 6)); + $string = substr($string, 6); + + # not a UTF-8 character + } else { + $skipped_bytes .= substr($string, 0, 1); + $string = substr($string, 1); + } + + $end_of_token_p_string .= ($string =~ /^\S/) ? "0" : "1" + if $#characters >= length($end_of_token_p_string); + } + $string =~ s/ $//; # remove previously added space, but keep original spaces + if ($return_trailing_whitespaces) { + while ($string =~ /^[ \t]/) { + push(@characters,substr($string, 0, 1)); + $string = substr($string, 1); + } + push(@characters, "\n") if $orig_string =~ /\n$/; + } + return ($return_only_chars) ? @characters : ($skipped_bytes, $end_of_token_p_string, @characters); +} + +sub max_substring_info { + local($caller,$s1,$s2,$info_type) = @_; + + ($skipped_bytes1, $end_of_token_p_string1, @char_list1) = $caller->split_into_utf8_characters($s1, "", *empty_ht); + ($skipped_bytes2, $end_of_token_p_string2, @char_list2) = $caller->split_into_utf8_characters($s2, "", *empty_ht); + return 0 if $skipped_bytes1 || $skipped_bytes2; + + $best_substring_start1 = 0; + $best_substring_start2 = 0; + $best_substring_length = 0; + + foreach $start_pos2 ((0 .. $#char_list2)) { + last if $start_pos2 + $best_substring_length > $#char_list2; + foreach $start_pos1 ((0 .. $#char_list1)) { + last if $start_pos1 + $best_substring_length > $#char_list1; + $matching_length = 0; + while (($start_pos1 + $matching_length <= $#char_list1) + && ($start_pos2 + $matching_length <= $#char_list2) + && ($char_list1[$start_pos1+$matching_length] eq $char_list2[$start_pos2+$matching_length])) { + $matching_length++; + } + if ($matching_length > $best_substring_length) { + $best_substring_length = $matching_length; + $best_substring_start1 = $start_pos1; + $best_substring_start2 = $start_pos2; + } + } + } + if ($info_type =~ /^max-ratio1$/) { + $length1 = $#char_list1 + 1; + return ($length1 > 0) ? ($best_substring_length / $length1) : 0; + } elsif ($info_type =~ /^max-ratio2$/) { + $length2 = $#char_list2 + 1; + return ($length2 > 0) ? ($best_substring_length / $length2) : 0; + } elsif ($info_type =~ /^substring$/) { + return join("", @char_list1[$best_substring_start1 .. $best_substring_start1+$best_substring_length-1]); + } else { + $length1 = $#char_list1 + 1; + $length2 = $#char_list2 + 1; + $info = "s1=$s1;s2=$s2"; + $info .= ";best_substring_length=$best_substring_length"; + $info .= ";best_substring_start1=$best_substring_start1"; + $info .= ";best_substring_start2=$best_substring_start2"; + $info .= ";length1=$length1"; + $info .= ";length2=$length2"; + return $info; + } +} + +sub n_shared_chars_at_start { + local($caller,$s1,$s2) = @_; + + my $n = 0; + while (($s1 ne "") && ($s2 ne "")) { + ($c1, $rest1) = ($s1 =~ /^(.[\x80-\xBF]*)(.*)$/); + ($c2, $rest2) = ($s2 =~ /^(.[\x80-\xBF]*)(.*)$/); + if ($c1 eq $c2) { + $n++; + $s1 = $rest1; + $s2 = $rest2; + } else { + last; + } + } + return $n; +} + +sub char_length { + local($caller,$string,$byte_offset) = @_; + + my $char = ($byte_offset) ? substr($string, $byte_offset) : $string; + return 1 if $char =~ /^[\x00-\x7F]/; + return 2 if $char =~ /^[\xC0-\xDF]/; + return 3 if $char =~ /^[\xE0-\xEF]/; + return 4 if $char =~ /^[\xF0-\xF7]/; + return 5 if $char =~ /^[\xF8-\xFB]/; + return 6 if $char =~ /^[\xFC-\xFD]/; + return 0; +} + +sub length_in_utf8_chars { + local($caller,$s) = @_; + + $s =~ s/[\x80-\xBF]//g; + $s =~ s/[\x00-\x7F\xC0-\xFF]/c/g; + return length($s); +} + +sub byte_length_of_n_chars { + local($caller,$char_length,$string,$byte_offset,$undef_return_value) = @_; + + $byte_offset = 0 unless defined($byte_offset); + $undef_return_value = -1 unless defined($undef_return_value); + my $result = 0; + my $len; + foreach $i ((1 .. $char_length)) { + $len = $caller->char_length($string,($byte_offset+$result)); + return $undef_return_value unless $len; + $result += $len; + } + return $result; +} + +sub replace_non_ASCII_bytes { + local($caller,$string,$replacement) = @_; + + $replacement = "HEX" unless defined($replacement); + if ($replacement =~ /^(Unicode|U\+4|\\u|HEX)$/) { + $new_string = ""; + while (($pre,$utf8_char, $post) = ($string =~ /^([\x09\x0A\x20-\x7E]*)([\x00-\x08\x0B-\x1F\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF8-\xFF][\x80-\xBF]+|[\x80-\xBF])(.*)$/s)) { + if ($replacement =~ /Unicode/) { + $new_string .= $pre . "<U" . (uc $caller->utf8_to_unicode($utf8_char)) . ">"; + } elsif ($replacement =~ /\\u/) { + $new_string .= $pre . "\\u" . (uc sprintf("%04x", $caller->utf8_to_unicode($utf8_char))); + } elsif ($replacement =~ /U\+4/) { + $new_string .= $pre . "<U+" . (uc $caller->utf8_to_4hex_unicode($utf8_char)) . ">"; + } else { + $new_string .= $pre . "<HEX-" . $caller->utf8_to_hex($utf8_char) . ">"; + } + $string = $post; + } + $new_string .= $string; + } else { + $new_string = $string; + $new_string =~ s/[\x80-\xFF]/$replacement/g; + } + return $new_string; +} + +sub valid_utf8_string_p { + local($caller,$string) = @_; + + return $string =~ /^(?:[\x09\x0A\x20-\x7E]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])*$/; +} + +sub valid_utf8_string_incl_ascii_control_p { + local($caller,$string) = @_; + + return $string =~ /^(?:[\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]|[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])*$/; +} + +sub utf8_to_hex { + local($caller,$s) = @_; + + $hex = ""; + foreach $i ((0 .. length($s)-1)) { + $hex .= uc sprintf("%2.2x",ord(substr($s, $i, 1))); + } + return $hex; +} + +sub utf8_to_4hex_unicode { + local($caller,$s) = @_; + + return sprintf("%4.4x", $caller->utf8_to_unicode($s)); +} + +sub utf8_to_unicode { + local($caller,$s) = @_; + + $unicode = 0; + foreach $i ((0 .. length($s)-1)) { + $c = substr($s, $i, 1); + if ($c =~ /^[\x80-\xBF]$/) { + $unicode = $unicode * 64 + (ord($c) & 0x3F); + } elsif ($c =~ /^[\xC0-\xDF]$/) { + $unicode = $unicode * 32 + (ord($c) & 0x1F); + } elsif ($c =~ /^[\xE0-\xEF]$/) { + $unicode = $unicode * 16 + (ord($c) & 0x0F); + } elsif ($c =~ /^[\xF0-\xF7]$/) { + $unicode = $unicode * 8 + (ord($c) & 0x07); + } elsif ($c =~ /^[\xF8-\xFB]$/) { + $unicode = $unicode * 4 + (ord($c) & 0x03); + } elsif ($c =~ /^[\xFC-\xFD]$/) { + $unicode = $unicode * 2 + (ord($c) & 0x01); + } + } + return $unicode; +} + +sub charhex { + local($caller,$string) = @_; + + my $result = ""; + while ($string ne "") { + $char = substr($string, 0, 1); + $string = substr($string, 1); + if ($char =~ /^[ -~]$/) { + $result .= $char; + } else { + $hex = sprintf("%2.2x",ord($char)); + $hex =~ tr/a-f/A-F/; + $result .= "<HEX-$hex>"; + } + } + return $result; +} + +sub windows1252_to_utf8 { + local($caller,$s, $norm_to_ascii_p, $preserve_potential_utf8s_p) = @_; + + return $s if $s =~ /^[\x00-\x7F]*$/; # all ASCII + + $norm_to_ascii_p = 1 unless defined($norm_to_ascii_p); + $preserve_potential_utf8s_p = 1 unless defined($preserve_potential_utf8s_p); + my $result = ""; + my $c = ""; + while ($s ne "") { + $n_bytes = 1; + if ($s =~ /^[\x00-\x7F]/) { + $result .= substr($s, 0, 1); # ASCII + } elsif ($preserve_potential_utf8s_p && ($s =~ /^[\xC0-\xDF][\x80-\xBF]/)) { + $result .= substr($s, 0, 2); # valid 2-byte UTF8 + $n_bytes = 2; + } elsif ($preserve_potential_utf8s_p && ($s =~ /^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]/)) { + $result .= substr($s, 0, 3); # valid 3-byte UTF8 + $n_bytes = 3; + } elsif ($preserve_potential_utf8s_p && ($s =~ /^[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]/)) { + $result .= substr($s, 0, 4); # valid 4-byte UTF8 + $n_bytes = 4; + } elsif ($preserve_potential_utf8s_p && ($s =~ /^[\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]/)) { + $result .= substr($s, 0, 5); # valid 5-byte UTF8 + $n_bytes = 5; + } elsif ($s =~ /^[\xA0-\xBF]/) { + $c = substr($s, 0, 1); + $result .= "\xC2$c"; + } elsif ($s =~ /^[\xC0-\xFF]/) { + $c = substr($s, 0, 1); + $c =~ tr/[\xC0-\xFF]/[\x80-\xBF]/; + $result .= "\xC3$c"; + } elsif ($s =~ /^\x80/) { + $result .= "\xE2\x82\xAC"; # Euro sign + } elsif ($s =~ /^\x82/) { + $result .= "\xE2\x80\x9A"; # single low quotation mark + } elsif ($s =~ /^\x83/) { + $result .= "\xC6\x92"; # Latin small letter f with hook + } elsif ($s =~ /^\x84/) { + $result .= "\xE2\x80\x9E"; # double low quotation mark + } elsif ($s =~ /^\x85/) { + $result .= ($norm_to_ascii_p) ? "..." : "\xE2\x80\xA6"; # horizontal ellipsis (three dots) + } elsif ($s =~ /^\x86/) { + $result .= "\xE2\x80\xA0"; # dagger + } elsif ($s =~ /^\x87/) { + $result .= "\xE2\x80\xA1"; # double dagger + } elsif ($s =~ /^\x88/) { + $result .= "\xCB\x86"; # circumflex + } elsif ($s =~ /^\x89/) { + $result .= "\xE2\x80\xB0"; # per mille sign + } elsif ($s =~ /^\x8A/) { + $result .= "\xC5\xA0"; # Latin capital letter S with caron + } elsif ($s =~ /^\x8B/) { + $result .= "\xE2\x80\xB9"; # single left-pointing angle quotation mark + } elsif ($s =~ /^\x8C/) { + $result .= "\xC5\x92"; # OE ligature + } elsif ($s =~ /^\x8E/) { + $result .= "\xC5\xBD"; # Latin capital letter Z with caron + } elsif ($s =~ /^\x91/) { + $result .= ($norm_to_ascii_p) ? "`" : "\xE2\x80\x98"; # left single quotation mark + } elsif ($s =~ /^\x92/) { + $result .= ($norm_to_ascii_p) ? "'" : "\xE2\x80\x99"; # right single quotation mark + } elsif ($s =~ /^\x93/) { + $result .= "\xE2\x80\x9C"; # left double quotation mark + } elsif ($s =~ /^\x94/) { + $result .= "\xE2\x80\x9D"; # right double quotation mark + } elsif ($s =~ /^\x95/) { + $result .= "\xE2\x80\xA2"; # bullet + } elsif ($s =~ /^\x96/) { + $result .= ($norm_to_ascii_p) ? "-" : "\xE2\x80\x93"; # n dash + } elsif ($s =~ /^\x97/) { + $result .= ($norm_to_ascii_p) ? "-" : "\xE2\x80\x94"; # m dash + } elsif ($s =~ /^\x98/) { + $result .= ($norm_to_ascii_p) ? "~" : "\xCB\x9C"; # small tilde + } elsif ($s =~ /^\x99/) { + $result .= "\xE2\x84\xA2"; # trade mark sign + } elsif ($s =~ /^\x9A/) { + $result .= "\xC5\xA1"; # Latin small letter s with caron + } elsif ($s =~ /^\x9B/) { + $result .= "\xE2\x80\xBA"; # single right-pointing angle quotation mark + } elsif ($s =~ /^\x9C/) { + $result .= "\xC5\x93"; # oe ligature + } elsif ($s =~ /^\x9E/) { + $result .= "\xC5\xBE"; # Latin small letter z with caron + } elsif ($s =~ /^\x9F/) { + $result .= "\xC5\xB8"; # Latin capital letter Y with diaeresis + } else { + $result .= "?"; + } + $s = substr($s, $n_bytes); + } + return $result; +} + +sub delete_weird_stuff { + local($caller, $s) = @_; + + # delete control chacters (except tab and linefeed), zero-width characters, byte order mark, + # directional marks, join marks, variation selectors, Arabic tatweel + $s =~ s/([\x00-\x08\x0B-\x1F\x7F]|\xC2[\x80-\x9F]|\xD9\x80|\xE2\x80[\x8B-\x8F]|\xEF\xB8[\x80-\x8F]|\xEF\xBB\xBF|\xF3\xA0[\x84-\x87][\x80-\xBF])//g; + return $s; +} + +sub number_of_utf8_character { + local($caller, $s) = @_; + + $s2 = $s; + $s2 =~ s/[\x80-\xBF]//g; + return length($s2); +} + +sub cap_letter_reg_exp { + # includes A-Z and other Latin-based capital letters with accents, umlauts and other decorations etc. + return "[A-Z]|\xC3[\x80-\x96\x98-\x9E]|\xC4[\x80\x82\x84\x86\x88\x8A\x8C\x8E\x90\x94\x964\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xAE\xB0\xB2\xB4\xB6\xB9\xBB\xBD\xBF]|\xC5[\x81\x83\x85\x87\x8A\x8C\x8E\x90\x92\x96\x98\x9A\x9C\x9E\xA0\xA2\xA4\xA6\xA8\xAA\xAC\xB0\xB2\xB4\xB6\xB8\xB9\xBB\xBD]"; +} + +sub regex_extended_case_expansion { + local($caller, $s) = @_; + + if ($s =~ /\xC3/) { + $s =~ s/\xC3\xA0/\xC3\[\x80\xA0\]/g; + $s =~ s/\xC3\xA1/\xC3\[\x81\xA1\]/g; + $s =~ s/\xC3\xA2/\xC3\[\x82\xA2\]/g; + $s =~ s/\xC3\xA3/\xC3\[\x83\xA3\]/g; + $s =~ s/\xC3\xA4/\xC3\[\x84\xA4\]/g; + $s =~ s/\xC3\xA5/\xC3\[\x85\xA5\]/g; + $s =~ s/\xC3\xA6/\xC3\[\x86\xA6\]/g; + $s =~ s/\xC3\xA7/\xC3\[\x87\xA7\]/g; + $s =~ s/\xC3\xA8/\xC3\[\x88\xA8\]/g; + $s =~ s/\xC3\xA9/\xC3\[\x89\xA9\]/g; + $s =~ s/\xC3\xAA/\xC3\[\x8A\xAA\]/g; + $s =~ s/\xC3\xAB/\xC3\[\x8B\xAB\]/g; + $s =~ s/\xC3\xAC/\xC3\[\x8C\xAC\]/g; + $s =~ s/\xC3\xAD/\xC3\[\x8D\xAD\]/g; + $s =~ s/\xC3\xAE/\xC3\[\x8E\xAE\]/g; + $s =~ s/\xC3\xAF/\xC3\[\x8F\xAF\]/g; + $s =~ s/\xC3\xB0/\xC3\[\x90\xB0\]/g; + $s =~ s/\xC3\xB1/\xC3\[\x91\xB1\]/g; + $s =~ s/\xC3\xB2/\xC3\[\x92\xB2\]/g; + $s =~ s/\xC3\xB3/\xC3\[\x93\xB3\]/g; + $s =~ s/\xC3\xB4/\xC3\[\x94\xB4\]/g; + $s =~ s/\xC3\xB5/\xC3\[\x95\xB5\]/g; + $s =~ s/\xC3\xB6/\xC3\[\x96\xB6\]/g; + $s =~ s/\xC3\xB8/\xC3\[\x98\xB8\]/g; + $s =~ s/\xC3\xB9/\xC3\[\x99\xB9\]/g; + $s =~ s/\xC3\xBA/\xC3\[\x9A\xBA\]/g; + $s =~ s/\xC3\xBB/\xC3\[\x9B\xBB\]/g; + $s =~ s/\xC3\xBC/\xC3\[\x9C\xBC\]/g; + $s =~ s/\xC3\xBD/\xC3\[\x9D\xBD\]/g; + $s =~ s/\xC3\xBE/\xC3\[\x9E\xBE\]/g; + } + if ($s =~ /\xC5/) { + $s =~ s/\xC5\x91/\xC5\[\x90\x91\]/g; + $s =~ s/\xC5\xA1/\xC5\[\xA0\xA1\]/g; + $s =~ s/\xC5\xB1/\xC5\[\xB0\xB1\]/g; + } + + return $s; +} + +sub extended_lower_case { + local($caller, $s) = @_; + + $s =~ tr/A-Z/a-z/; + + # Latin-1 + if ($s =~ /\xC3[\x80-\x9F]/) { + $s =~ s/À/à/g; + $s =~ s/Á/á/g; + $s =~ s/Â/â/g; + $s =~ s/Ã/ã/g; + $s =~ s/Ä/ä/g; + $s =~ s/Å/å/g; + $s =~ s/Æ/æ/g; + $s =~ s/Ç/ç/g; + $s =~ s/È/è/g; + $s =~ s/É/é/g; + $s =~ s/Ê/ê/g; + $s =~ s/Ë/ë/g; + $s =~ s/Ì/ì/g; + $s =~ s/Í/í/g; + $s =~ s/Î/î/g; + $s =~ s/Ï/ï/g; + $s =~ s/Ð/ð/g; + $s =~ s/Ñ/ñ/g; + $s =~ s/Ò/ò/g; + $s =~ s/Ó/ó/g; + $s =~ s/Ô/ô/g; + $s =~ s/Õ/õ/g; + $s =~ s/Ö/ö/g; + $s =~ s/Ø/ø/g; + $s =~ s/Ù/ù/g; + $s =~ s/Ú/ú/g; + $s =~ s/Û/û/g; + $s =~ s/Ü/ü/g; + $s =~ s/Ý/ý/g; + $s =~ s/Þ/þ/g; + } + # Latin Extended-A + if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) { + $s =~ s/Ā/ā/g; + $s =~ s/Ă/ă/g; + $s =~ s/Ą/ą/g; + $s =~ s/Ć/ć/g; + $s =~ s/Ĉ/ĉ/g; + $s =~ s/Ċ/ċ/g; + $s =~ s/Č/č/g; + $s =~ s/Ď/ď/g; + $s =~ s/Đ/đ/g; + $s =~ s/Ē/ē/g; + $s =~ s/Ĕ/ĕ/g; + $s =~ s/Ė/ė/g; + $s =~ s/Ę/ę/g; + $s =~ s/Ě/ě/g; + $s =~ s/Ĝ/ĝ/g; + $s =~ s/Ğ/ğ/g; + $s =~ s/Ġ/ġ/g; + $s =~ s/Ģ/ģ/g; + $s =~ s/Ĥ/ĥ/g; + $s =~ s/Ħ/ħ/g; + $s =~ s/Ĩ/ĩ/g; + $s =~ s/Ī/ī/g; + $s =~ s/Ĭ/ĭ/g; + $s =~ s/Į/į/g; + $s =~ s/İ/ı/g; + $s =~ s/IJ/ij/g; + $s =~ s/Ĵ/ĵ/g; + $s =~ s/Ķ/ķ/g; + $s =~ s/Ĺ/ĺ/g; + $s =~ s/Ļ/ļ/g; + $s =~ s/Ľ/ľ/g; + $s =~ s/Ŀ/ŀ/g; + $s =~ s/Ł/ł/g; + $s =~ s/Ń/ń/g; + $s =~ s/Ņ/ņ/g; + $s =~ s/Ň/ň/g; + $s =~ s/Ŋ/ŋ/g; + $s =~ s/Ō/ō/g; + $s =~ s/Ŏ/ŏ/g; + $s =~ s/Ő/ő/g; + $s =~ s/Œ/œ/g; + $s =~ s/Ŕ/ŕ/g; + $s =~ s/Ŗ/ŗ/g; + $s =~ s/Ř/ř/g; + $s =~ s/Ś/ś/g; + $s =~ s/Ŝ/ŝ/g; + $s =~ s/Ş/ş/g; + $s =~ s/Š/š/g; + $s =~ s/Ţ/ţ/g; + $s =~ s/Ť/ť/g; + $s =~ s/Ŧ/ŧ/g; + $s =~ s/Ũ/ũ/g; + $s =~ s/Ū/ū/g; + $s =~ s/Ŭ/ŭ/g; + $s =~ s/Ů/ů/g; + $s =~ s/Ű/ű/g; + $s =~ s/Ų/ų/g; + $s =~ s/Ŵ/ŵ/g; + $s =~ s/Ŷ/ŷ/g; + $s =~ s/Ź/ź/g; + $s =~ s/Ż/ż/g; + $s =~ s/Ž/ž/g; + } + # Greek letters + if ($s =~ /\xCE[\x86-\xAB]/) { + $s =~ s/Α/α/g; + $s =~ s/Β/β/g; + $s =~ s/Γ/γ/g; + $s =~ s/Δ/δ/g; + $s =~ s/Ε/ε/g; + $s =~ s/Ζ/ζ/g; + $s =~ s/Η/η/g; + $s =~ s/Θ/θ/g; + $s =~ s/Ι/ι/g; + $s =~ s/Κ/κ/g; + $s =~ s/Λ/λ/g; + $s =~ s/Μ/μ/g; + $s =~ s/Ν/ν/g; + $s =~ s/Ξ/ξ/g; + $s =~ s/Ο/ο/g; + $s =~ s/Π/π/g; + $s =~ s/Ρ/ρ/g; + $s =~ s/Σ/σ/g; + $s =~ s/Τ/τ/g; + $s =~ s/Υ/υ/g; + $s =~ s/Φ/φ/g; + $s =~ s/Χ/χ/g; + $s =~ s/Ψ/ψ/g; + $s =~ s/Ω/ω/g; + $s =~ s/Ϊ/ϊ/g; + $s =~ s/Ϋ/ϋ/g; + $s =~ s/Ά/ά/g; + $s =~ s/Έ/έ/g; + $s =~ s/Ή/ή/g; + $s =~ s/Ί/ί/g; + $s =~ s/Ό/ό/g; + $s =~ s/Ύ/ύ/g; + $s =~ s/Ώ/ώ/g; + } + # Cyrillic letters + if ($s =~ /\xD0[\x80-\xAF]/) { + $s =~ s/А/а/g; + $s =~ s/Б/б/g; + $s =~ s/В/в/g; + $s =~ s/Г/г/g; + $s =~ s/Д/д/g; + $s =~ s/Е/е/g; + $s =~ s/Ж/ж/g; + $s =~ s/З/з/g; + $s =~ s/И/и/g; + $s =~ s/Й/й/g; + $s =~ s/К/к/g; + $s =~ s/Л/л/g; + $s =~ s/М/м/g; + $s =~ s/Н/н/g; + $s =~ s/О/о/g; + $s =~ s/П/п/g; + $s =~ s/Р/р/g; + $s =~ s/С/с/g; + $s =~ s/Т/т/g; + $s =~ s/У/у/g; + $s =~ s/Ф/ф/g; + $s =~ s/Х/х/g; + $s =~ s/Ц/ц/g; + $s =~ s/Ч/ч/g; + $s =~ s/Ш/ш/g; + $s =~ s/Щ/щ/g; + $s =~ s/Ъ/ъ/g; + $s =~ s/Ы/ы/g; + $s =~ s/Ь/ь/g; + $s =~ s/Э/э/g; + $s =~ s/Ю/ю/g; + $s =~ s/Я/я/g; + $s =~ s/Ѐ/ѐ/g; + $s =~ s/Ё/ё/g; + $s =~ s/Ђ/ђ/g; + $s =~ s/Ѓ/ѓ/g; + $s =~ s/Є/є/g; + $s =~ s/Ѕ/ѕ/g; + $s =~ s/І/і/g; + $s =~ s/Ї/ї/g; + $s =~ s/Ј/ј/g; + $s =~ s/Љ/љ/g; + $s =~ s/Њ/њ/g; + $s =~ s/Ћ/ћ/g; + $s =~ s/Ќ/ќ/g; + $s =~ s/Ѝ/ѝ/g; + $s =~ s/Ў/ў/g; + $s =~ s/Џ/џ/g; + } + # Fullwidth A-Z + if ($s =~ /\xEF\xBC[\xA1-\xBA]/) { + $s =~ s/A/a/g; + $s =~ s/B/b/g; + $s =~ s/C/c/g; + $s =~ s/D/d/g; + $s =~ s/E/e/g; + $s =~ s/F/f/g; + $s =~ s/G/g/g; + $s =~ s/H/h/g; + $s =~ s/I/i/g; + $s =~ s/J/j/g; + $s =~ s/K/k/g; + $s =~ s/L/l/g; + $s =~ s/M/m/g; + $s =~ s/N/n/g; + $s =~ s/O/o/g; + $s =~ s/P/p/g; + $s =~ s/Q/q/g; + $s =~ s/R/r/g; + $s =~ s/S/s/g; + $s =~ s/T/t/g; + $s =~ s/U/u/g; + $s =~ s/V/v/g; + $s =~ s/W/w/g; + $s =~ s/X/x/g; + $s =~ s/Y/y/g; + $s =~ s/Z/z/g; + } + + return $s; +} + +sub extended_upper_case { + local($caller, $s) = @_; + + $s =~ tr/a-z/A-Z/; + return $s unless $s =~ /[\xC3-\xC5][\x80-\xBF]/; + + $s =~ s/\xC3\xA0/\xC3\x80/g; + $s =~ s/\xC3\xA1/\xC3\x81/g; + $s =~ s/\xC3\xA2/\xC3\x82/g; + $s =~ s/\xC3\xA3/\xC3\x83/g; + $s =~ s/\xC3\xA4/\xC3\x84/g; + $s =~ s/\xC3\xA5/\xC3\x85/g; + $s =~ s/\xC3\xA6/\xC3\x86/g; + $s =~ s/\xC3\xA7/\xC3\x87/g; + $s =~ s/\xC3\xA8/\xC3\x88/g; + $s =~ s/\xC3\xA9/\xC3\x89/g; + $s =~ s/\xC3\xAA/\xC3\x8A/g; + $s =~ s/\xC3\xAB/\xC3\x8B/g; + $s =~ s/\xC3\xAC/\xC3\x8C/g; + $s =~ s/\xC3\xAD/\xC3\x8D/g; + $s =~ s/\xC3\xAE/\xC3\x8E/g; + $s =~ s/\xC3\xAF/\xC3\x8F/g; + $s =~ s/\xC3\xB0/\xC3\x90/g; + $s =~ s/\xC3\xB1/\xC3\x91/g; + $s =~ s/\xC3\xB2/\xC3\x92/g; + $s =~ s/\xC3\xB3/\xC3\x93/g; + $s =~ s/\xC3\xB4/\xC3\x94/g; + $s =~ s/\xC3\xB5/\xC3\x95/g; + $s =~ s/\xC3\xB6/\xC3\x96/g; + $s =~ s/\xC3\xB8/\xC3\x98/g; + $s =~ s/\xC3\xB9/\xC3\x99/g; + $s =~ s/\xC3\xBA/\xC3\x9A/g; + $s =~ s/\xC3\xBB/\xC3\x9B/g; + $s =~ s/\xC3\xBC/\xC3\x9C/g; + $s =~ s/\xC3\xBD/\xC3\x9D/g; + $s =~ s/\xC3\xBE/\xC3\x9E/g; + + $s =~ s/\xC5\x91/\xC5\x90/g; + $s =~ s/\xC5\xA1/\xC5\xA0/g; + $s =~ s/\xC5\xB1/\xC5\xB0/g; + return $s unless $s =~ /[\xC3-\xC5][\x80-\xBF]/; + + return $s; +} + +sub extended_first_upper_case { + local($caller, $s) = @_; + + if (($first_char, $rest) = ($s =~ /^([\x00-\x7F]|[\xC0-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF][\x80-\xBF])(.*)$/)) { + return $caller->extended_upper_case($first_char) . $rest; + } else { + return $s; + } +} + +sub repair_doubly_converted_utf8_strings { + local($caller, $s) = @_; + + if ($s =~ /\xC3[\x82-\x85]\xC2[\x80-\xBF]/) { + $s =~ s/\xC3\x82\xC2([\x80-\xBF])/\xC2$1/g; + $s =~ s/\xC3\x83\xC2([\x80-\xBF])/\xC3$1/g; + $s =~ s/\xC3\x84\xC2([\x80-\xBF])/\xC4$1/g; + $s =~ s/\xC3\x85\xC2([\x80-\xBF])/\xC5$1/g; + } + return $s; +} + +sub repair_misconverted_windows_to_utf8_strings { + local($caller, $s) = @_; + + # correcting conversions of UTF8 using Latin1-to-UTF converter + if ($s =~ /\xC3\xA2\xC2\x80\xC2[\x90-\xEF]/) { + my $result = ""; + while (($pre,$last_c,$post) = ($s =~ /^(.*?)\xC3\xA2\xC2\x80\xC2([\x90-\xEF])(.*)$/s)) { + $result .= "$pre\xE2\x80$last_c"; + $s = $post; + } + $result .= $s; + $s = $result; + } + # correcting conversions of Windows1252-to-UTF8 using Latin1-to-UTF converter + if ($s =~ /\xC2[\x80-\x9F]/) { + my $result = ""; + while (($pre,$c_windows,$post) = ($s =~ /^(.*?)\xC2([\x80-\x9F])(.*)$/s)) { + $c_utf8 = $caller->windows1252_to_utf8($c_windows, 0); + $result .= ($c_utf8 eq "?") ? ($pre . "\xC2" . $c_windows) : "$pre$c_utf8"; + $s = $post; + } + $result .= $s; + $s = $result; + } + if ($s =~ /\xC3/) { + $s =~ s/\xC3\xA2\xE2\x80\x9A\xC2\xAC/\xE2\x82\xAC/g; # x80 -> Euro sign + # x81 codepoint undefined in Windows 1252 + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xA1/\xE2\x80\x9A/g; # x82 -> single low-9 quotation mark + $s =~ s/\xC3\x86\xE2\x80\x99/\xC6\x92/g; # x83 -> Latin small letter f with hook + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\xBE/\xE2\x80\x9E/g; # x84 -> double low-9 quotation mark + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA6/\xE2\x80\xA6/g; # x85 -> horizontal ellipsis + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA0/\xE2\x80\xA0/g; # x86 -> dagger + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA1/\xE2\x80\xA1/g; # x87 -> double dagger + $s =~ s/\xC3\x8B\xE2\x80\xA0/\xCB\x86/g; # x88 -> modifier letter circumflex accent + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB0/\xE2\x80\xB0/g; # x89 -> per mille sign + $s =~ s/\xC3\x85\xC2\xA0/\xC5\xA0/g; # x8A -> Latin capital letter S with caron + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xB9/\xE2\x80\xB9/g; # x8B -> single left-pointing angle quotation mark + $s =~ s/\xC3\x85\xE2\x80\x99/\xC5\x92/g; # x8C -> Latin capital ligature OE + # x8D codepoint undefined in Windows 1252 + $s =~ s/\xC3\x85\xC2\xBD/\xC5\xBD/g; # x8E -> Latin capital letter Z with caron + # x8F codepoint undefined in Windows 1252 + # x90 codepoint undefined in Windows 1252 + $s =~ s/\xC3\xA2\xE2\x82\xAC\xCB\x9C/\xE2\x80\x98/g; # x91 a-circumflex+euro+small tilde -> left single quotation mark + $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2/\xE2\x80\x99/g; # x92 a-circumflex+euro+trademark -> right single quotation mark + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC5\x93/\xE2\x80\x9C/g; # x93 a-circumflex+euro+Latin small ligature oe -> left double quotation mark + # x94 maps through undefined intermediate code point + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xA2/\xE2\x80\xA2/g; # x95 a-circumflex+euro+cent sign -> bullet + $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C/\xE2\x80\x93/g; # x96 a-circumflex+euro+left double quotation mark -> en dash + $s =~ s/\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D/\xE2\x80\x94/g; # x97 a-circumflex+euro+right double quotation mark -> em dash + $s =~ s/\xC3\x8B\xC5\x93/\xCB\x9C/g; # x98 Latin capital e diaeresis+Latin small ligature oe -> small tilde + $s =~ s/\xC3\xA2\xE2\x80\x9E\xC2\xA2/\xE2\x84\xA2/g; # x99 -> trade mark sign + $s =~ s/\xC3\x85\xC2\xA1/\xC5\xA1/g; # x9A -> Latin small letter s with caron + $s =~ s/\xC3\xA2\xE2\x82\xAC\xC2\xBA/\xE2\x80\xBA/g; # x9B -> single right-pointing angle quotation mark + $s =~ s/\xC3\x85\xE2\x80\x9C/\xC5\x93/g; # x9C -> Latin small ligature oe + # x9D codepoint undefined in Windows 1252 + $s =~ s/\xC3\x85\xC2\xBE/\xC5\xBE/g; # x9E -> Latin small letter z with caron + $s =~ s/\xC3\x85\xC2\xB8/\xC5\xB8/g; # x9F -> Latin capital letter Y with diaeresis + $s =~ s/\xC3\xAF\xC2\xBF\xC2\xBD/\xEF\xBF\xBD/g; # replacement character + } + + return $s; +} + +sub latin1_to_utf { + local($caller, $s) = @_; + + my $result = ""; + while (($pre,$c,$post) = ($s =~ /^(.*?)([\x80-\xFF])(.*)$/s)) { + $result .= $pre; + if ($c =~ /^[\x80-\xBF]$/) { + $result .= "\xC2$c"; + } elsif ($c =~ /^[\xC0-\xFF]$/) { + $c =~ tr/[\xC0-\xFF]/[\x80-\xBF]/; + $result .= "\xC3$c"; + } + $s = $post; + } + $result .= $s; + return $result; +} + +sub character_type_is_letter_type { + local($caller, $char_type) = @_; + + return ($char_type =~ /\b((CJK|hiragana|kana|katakana)\s+character|diacritic|letter|syllable)\b/); +} + +sub character_type { + local($caller, $c) = @_; + + if ($c =~ /^[\x00-\x7F]/) { + return "XML tag" if $c =~ /^<.*>$/; + return "ASCII Latin letter" if $c =~ /^[a-z]$/i; + return "ASCII digit" if $c =~ /^[0-9]$/i; + return "ASCII whitespace" if $c =~ /^[\x09-\x0D\x20]$/; + return "ASCII control-character" if $c =~ /^[\x00-\x1F\x7F]$/; + return "ASCII currency" if $c eq "\$"; + return "ASCII punctuation"; + } elsif ($c =~ /^[\xC0-\xDF]/) { + return "non-UTF8 (invalid)" unless $c =~ /^[\xC0-\xDF][\x80-\xBF]$/; + return "non-shortest-UTF8 (invalid)" if $c =~ /[\xC0-\xC1]/; + return "non-ASCII control-character" if $c =~ /\xC2[\x80-\x9F]/; + return "non-ASCII whitespace" if $c =~ /\xC2\xA0/; + return "non-ASCII currency" if $c =~ /\xC2[\xA2-\xA5]/; + return "superscript digit" if $c =~ /\xC2[\xB2\xB3\xB9]/; + return "non-ASCII Latin letter" if $c =~ /\xC2\xB5/; # micro sign + return "non-ASCII punctuation" if $c =~ /\xC2[\xA0-\xBF]/; + return "non-ASCII punctuation" if $c =~ /\xC3[\x97\xB7]/; + return "non-ASCII Latin letter" if $c =~ /\xC3[\x80-\xBF]/; + return "Latin ligature letter" if $c =~ /\xC4[\xB2\xB3]/; + return "Latin ligature letter" if $c =~ /\xC5[\x92\x93]/; + return "non-ASCII Latin letter" if $c =~ /[\xC4-\xC8]/; + return "non-ASCII Latin letter" if $c =~ /\xC9[\x80-\x8F]/; + return "IPA" if $c =~ /\xC9[\x90-\xBF]/; + return "IPA" if $c =~ /\xCA[\x80-\xBF]/; + return "IPA" if $c =~ /\xCB[\x80-\xBF]/; + return "combining-diacritic" if $c =~ /\xCC[\x80-\xBF]/; + return "combining-diacritic" if $c =~ /\xCD[\x80-\xAF]/; + return "Greek punctuation" if $c =~ /\xCD[\xBE]/; # Greek question mark + return "Greek punctuation" if $c =~ /\xCE[\x87]/; # Greek semicolon + return "Greek letter" if $c =~ /\xCD[\xB0-\xBF]/; + return "Greek letter" if $c =~ /\xCE/; + return "Greek letter" if $c =~ /\xCF[\x80-\xA1\xB3\xB7\xB8\xBA\xBB]/; + return "Coptic letter" if $c =~ /\xCF[\xA2-\xAF]/; + return "Cyrillic letter" if $c =~ /[\xD0-\xD3]/; + return "Cyrillic letter" if $c =~ /\xD4[\x80-\xAF]/; + return "Armenian punctuation" if $c =~ /\xD5[\x9A-\x9F]/; + return "Armenian punctuation" if $c =~ /\xD6[\x89-\x8F]/; + return "Armenian letter" if $c =~ /\xD4[\xB0-\xBF]/; + return "Armenian letter" if $c =~ /\xD5/; + return "Armenian letter" if $c =~ /\xD6[\x80-\x8F]/; + return "Hebrew accent" if $c =~ /\xD6[\x91-\xAE]/; + return "Hebrew punctuation" if $c =~ /\xD6\xBE/; + return "Hebrew punctuation" if $c =~ /\xD7[\x80\x83\x86\xB3\xB4]/; + return "Hebrew point" if $c =~ /\xD6[\xB0-\xBF]/; + return "Hebrew point" if $c =~ /\xD7[\x81\x82\x87]/; + return "Hebrew letter" if $c =~ /\xD7[\x90-\xB2]/; + return "other Hebrew" if $c =~ /\xD6[\x90-\xBF]/; + return "other Hebrew" if $c =~ /\xD7/; + return "Arabic currency" if $c =~ /\xDB\xB8/; # Afghani sign + return "Arabic punctuation" if $c =~ /\xD8[\x89-\x8D\x9B\x9E\x9F]/; + return "Arabic punctuation" if $c =~ /\xD9[\xAA-\xAD]/; + return "Arabic punctuation" if $c =~ /\xDB[\x94]/; + return "Arabic tatweel" if $c =~ /\xD9\x80/; + return "Arabic letter" if $c =~ /\xD8[\xA0-\xBF]/; + return "Arabic letter" if $c =~ /\xD9[\x81-\x9F]/; + return "Arabic letter" if $c =~ /\xD9[\xAE-\xBF]/; + return "Arabic letter" if $c =~ /\xDA[\x80-\xBF]/; + return "Arabic letter" if $c =~ /\xDB[\x80-\x95]/; + return "Arabic Indic digit" if $c =~ /\xD9[\xA0-\xA9]/; + return "Arabic Indic digit" if $c =~ /\xDB[\xB0-\xB9]/; + return "other Arabic" if $c =~ /[\xD8-\xDB]/; + return "Syriac punctuation" if $c =~ /\xDC[\x80-\x8F]/; + return "Syriac letter" if $c =~ /\xDC[\x90-\xAF]/; + return "Syriac diacritic" if $c =~ /\xDC[\xB0-\xBF]/; + return "Syriac diacritic" if $c =~ /\xDD[\x80-\x8A]/; + return "Thaana letter" if $c =~ /\xDE/; + } elsif ($c =~ /^[\xE0-\xEF]/) { + return "non-UTF8 (invalid)" unless $c =~ /^[\xE0-\xEF][\x80-\xBF]{2,2}$/; + return "non-shortest-UTF8 (invalid)" if $c =~ /\xE0[\x80-\x9F]/; + return "Arabic letter" if $c =~ /\xE0\xA2[\xA0-\xBF]/; # extended letters + return "other Arabic" if $c =~ /\xE0\xA3/; # extended characters + return "Devanagari punctuation" if $c =~ /\xE0\xA5[\xA4\xA5]/; # danda, double danda + return "Devanagari digit" if $c =~ /\xE0\xA5[\xA6-\xAF]/; + return "Devanagari letter" if $c =~ /\xE0[\xA4-\xA5]/; + return "Bengali digit" if $c =~ /\xE0\xA7[\xA6-\xAF]/; + return "Bengali currency" if $c =~ /\xE0\xA7[\xB2-\xB9]/; + return "Bengali letter" if $c =~ /\xE0[\xA6-\xA7]/; + return "Gurmukhi digit" if $c =~ /\xE0\xA9[\xA6-\xAF]/; + return "Gurmukhi letter" if $c =~ /\xE0[\xA8-\xA9]/; + return "Gujarati digit" if $c =~ /\xE0\xAB[\xA6-\xAF]/; + return "Gujarati letter" if $c =~ /\xE0[\xAA-\xAB]/; + return "Oriya digit" if $c =~ /\xE0\xAD[\xA6-\xAF]/; + return "Oriya digit" if $c =~ /\xE0\xAD[\xB2-\xB7]/; # fraction + return "Oriya letter" if $c =~ /\xE0[\xAC-\xAD]/; + return "Tamil digit" if $c =~ /\xE0\xAF[\xA6-\xAF]/; + return "Tamil digit" if $c =~ /\xE0\xAF[\xB0-\xB2]/; # number (10, 100, 1000) + return "Tamil letter" if $c =~ /\xE0[\xAE-\xAF]/; + return "Telegu digit" if $c =~ /\xE0\xB1[\xA6-\xAF]/; + return "Telegu digit" if $c =~ /\xE0\xB1[\xB8-\xBE]/; # fraction digit + return "Telegu letter" if $c =~ /\xE0[\xB0-\xB1]/; + return "Kannada digit" if $c =~ /\xE0\xB3[\xA6-\xAF]/; + return "Kannada letter" if $c =~ /\xE0[\xB2-\xB3]/; + return "Malayalam digit" if $c =~ /\xE0\xB5[\x98-\x9E\xA6-\xB8]/; + return "Malayalam punctuation" if $c =~ /\xE0\xB5\xB9/; # date mark + return "Malayalam letter" if $c =~ /\xE0[\xB4-\xB5]/; + return "Sinhala digit" if $c =~ /\xE0\xB7[\xA6-\xAF]/; + return "Sinhala punctuation" if $c =~ /\xE0\xB7\xB4/; + return "Sinhala letter" if $c =~ /\xE0[\xB6-\xB7]/; + return "Thai currency" if $c =~ /\xE0\xB8\xBF/; + return "Thai digit" if $c =~ /\xE0\xB9[\x90-\x99]/; + return "Thai character" if $c =~ /\xE0[\xB8-\xB9]/; + return "Lao punctuation" if $c =~ /\xE0\xBA\xAF/; # Lao ellipsis + return "Lao digit" if $c =~ /\xE0\xBB[\x90-\x99]/; + return "Lao character" if $c =~ /\xE0[\xBA-\xBB]/; + return "Tibetan punctuation" if $c =~ /\xE0\xBC[\x81-\x94]/; + return "Tibetan sign" if $c =~ /\xE0\xBC[\x95-\x9F]/; + return "Tibetan digit" if $c =~ /\xE0\xBC[\xA0-\xB3]/; + return "Tibetan punctuation" if $c =~ /\xE0\xBC[\xB4-\xBD]/; + return "Tibetan letter" if $c =~ /\xE0[\xBC-\xBF]/; + return "Myanmar digit" if $c =~ /\xE1\x81[\x80-\x89]/; + return "Myanmar digit" if $c =~ /\xE1\x82[\x90-\x99]/; # Myanmar Shan digits + return "Myanmar punctuation" if $c =~ /\xE1\x81[\x8A-\x8B]/; + return "Myanmar letter" if $c =~ /\xE1[\x80-\x81]/; + return "Myanmar letter" if $c =~ /\xE1\x82[\x80-\x9F]/; + return "Georgian punctuation" if $c =~ /\xE1\x83\xBB/; + return "Georgian letter" if $c =~ /\xE1\x82[\xA0-\xBF]/; + return "Georgian letter" if $c =~ /\xE1\x83/; + return "Georgian letter" if $c =~ /\xE1\xB2[\x90-\xBF]/; # Georgian Mtavruli capital letters + return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; # Georgian small letters (Khutsuri) + return "Korean Hangul character" if $c =~ /\xE1[\x84-\x87]/; + return "Ethiopic punctuation" if $c =~ /\xE1\x8D[\xA0-\xA8]/; + return "Ethiopic digit" if $c =~ /\xE1\x8D[\xA9-\xBC]/; + return "Ethiopic syllable" if $c =~ /\xE1[\x88-\x8D]/; + return "Cherokee letter" if $c =~ /\xE1\x8E[\xA0-\xBF]/; + return "Cherokee letter" if $c =~ /\xE1\x8F/; + return "Canadian punctuation" if $c =~ /\xE1\x90\x80/; # Canadian Syllabics hyphen + return "Canadian punctuation" if $c =~ /\xE1\x99\xAE/; # Canadian Syllabics full stop + return "Canadian syllable" if $c =~ /\xE1[\x90-\x99]/; + return "Canadian syllable" if $c =~ /\xE1\xA2[\xB0-\xBF]/; + return "Canadian syllable" if $c =~ /\xE1\xA3/; + return "Ogham whitespace" if $c =~ /\xE1\x9A\x80/; + return "Ogham letter" if $c =~ /\xE1\x9A[\x81-\x9A]/; + return "Ogham punctuation" if $c =~ /\xE1\x9A[\x9B-\x9C]/; + return "Runic punctuation" if $c =~ /\xE1\x9B[\xAB-\xAD]/; + return "Runic letter" if $c =~ /\xE1\x9A[\xA0-\xBF]/; + return "Runic letter" if $c =~ /\xE1\x9B/; + return "Khmer currency" if $c =~ /\xE1\x9F\x9B/; + return "Khmer digit" if $c =~ /\xE1\x9F[\xA0-\xA9]/; + return "Khmer letter" if $c =~ /\xE1[\x9E-\x9F]/; + return "Mongolian punctuation" if $c =~ /\xE1\xA0[\x80-\x8A]/; + return "Mongolian digit" if $c =~ /\xE1\xA0[\x90-\x99]/; + return "Mongolian letter" if $c =~ /\xE1[\xA0-\xA1]/; + return "Mongolian letter" if $c =~ /\xE1\xA2[\x80-\xAF]/; + return "Buginese letter" if $c =~ /\xE1\xA8[\x80-\x9B]/; + return "Buginese punctuation" if $c =~ /\xE1\xA8[\x9E-\x9F]/; + return "Balinese letter" if $c =~ /\xE1\xAC/; + return "Balinese letter" if $c =~ /\xE1\xAD[\x80-\x8F]/; + return "Balinese digit" if $c =~ /\xE1\xAD[\x90-\x99]/; + return "Balinese puncutation" if $c =~ /\xE1\xAD[\x9A-\xA0]/; + return "Balinese symbol" if $c =~ /\xE1\xAD[\xA1-\xBF]/; + return "Sundanese digit" if $c =~ /\xE1\xAE[\xB0-\xB9]/; + return "Sundanese letter" if $c =~ /\xE1\xAE/; + return "Cyrillic letter" if $c =~ /\xE1\xB2[\x80-\x8F]/; + return "Sundanese punctuation" if $c =~ /\xE1\xB3[\x80-\x8F]/; + return "IPA" if $c =~ /\xE1[\xB4-\xB6]/; + return "non-ASCII Latin letter" if $c =~ /\xE1[\xB8-\xBB]/; + return "Greek letter" if $c =~ /\xE1[\xBC-\xBF]/; + return "non-ASCII whitespace" if $c =~ /\xE2\x80[\x80-\x8A]/; + return "zero-width space" if $c =~ /\xE2\x80\x8B/; + return "zero-width non-space" if $c =~ /\xE2\x80\x8C/; + return "zero-width joiner" if $c =~ /\xE2\x80\x8D/; + return "directional mark" if $c =~ /\xE2\x80[\x8E-\x8F]/; + return "non-ASCII punctuation" if $c =~ /\xE2\x80[\x90-\xBF]/; + return "non-ASCII punctuation" if $c =~ /\xE2\x81[\x80-\x9E]/; + return "superscript letter" if $c =~ /\xE2\x81[\xB1\xBF]/; + return "superscript digit" if $c =~ /\xE2\x81[\xB0-\xB9]/; + return "superscript punctuation" if $c =~ /\xE2\x81[\xBA-\xBE]/; + return "subscript digit" if $c =~ /\xE2\x82[\x80-\x89]/; + return "subscript punctuation" if $c =~ /\xE2\x82[\x8A-\x8E]/; + return "non-ASCII currency" if $c =~ /\xE2\x82[\xA0-\xBF]/; + return "letterlike symbol" if $c =~ /\xE2\x84/; + return "letterlike symbol" if $c =~ /\xE2\x85[\x80-\x8F]/; + return "arrow symbol" if $c =~ /\xE2\x86[\x90-\xBF]/; + return "arrow symbol" if $c =~ /\xE2\x87/; + return "mathematical operator" if $c =~ /\xE2[\x88-\x8B]/; + return "enclosed alphanumeric" if $c =~ /\xE2\x91[\xA0-\xBF]/; + return "enclosed alphanumeric" if $c =~ /\xE2[\x92-\x93]/; + return "box drawing" if $c =~ /\xE2[\x94-\x95]/; + return "geometric shape" if $c =~ /\xE2\x96[\xA0-\xBF]/; + return "geometric shape" if $c =~ /\xE2\x97/; + return "pictograph" if $c =~ /\xE2[\x98-\x9E]/; + return "Coptic digit" if $c =~ /\xE2\xB3\xBD/; # fraction 1/2 + return "Coptic punctuation" if $c =~ /\xE2\xB3[\xB9-\xBF]/; + return "Coptic letter" if $c =~ /\xE2[\xB2-\xB3]/; + return "Georgian letter" if $c =~ /\xE2\xB4[\x80-\xAF]/; + return "Tifinagh punctuation" if $c =~ /\xE2\xB5\xB0/; + return "Tifinagh letter" if $c =~ /\xE2\xB4[\xB0-\xBF]/; + return "Tifinagh letter" if $c =~ /\xE2\xB5/; + return "Ethiopic syllable" if $c =~ /\xE2\xB6/; + return "Ethiopic syllable" if $c =~ /\xE2\xB7[\x80-\x9F]/; + return "non-ASCII punctuation" if $c =~ /\xE3\x80[\x80-\xA0]/; + return "Japanese kana character" if $c =~ /\xE3[\x81-\x83]/; + return "Bopomofo letter" if $c =~ /\xE3\x84[\x80-\xAF]/; + return "Korean Hangul letter" if $c =~ /\xE3\x84[\xB0-\xBF]/; + return "Korean Hangul letter" if $c =~ /\xE3\x85/; + return "Korean Hangul letter" if $c =~ /\xE3\x86[\x80-\x8F]/; + return "Bopomofo letter" if $c =~ /\xE3\x86[\xA0-\xBF]/; + return "CJK stroke" if $c =~ /\xE3\x87[\x80-\xAF]/; + return "Japanese kana character" if $c =~ /\xE3\x87[\xB0-\xBF]/; + return "CJK square Latin abbreviation" if $c =~ /\xE3\x8D[\xB1-\xBA]/; + return "CJK square Latin abbreviation" if $c =~ /\xE3\x8E/; + return "CJK square Latin abbreviation" if $c =~ /\xE3\x8F[\x80-\x9F\xBF]/; + return "CJK character" if $c =~ /\xE4[\xB8-\xBF]/; + return "CJK character" if $c =~ /[\xE5-\xE9]/; + return "Yi syllable" if $c =~ /\xEA[\x80-\x92]/; + return "Lisu letter" if $c =~ /\xEA\x93[\x90-\xBD]/; + return "Lisu punctuation" if $c =~ /\xEA\x93[\xBE-\xBF]/; + return "Cyrillic letter" if $c =~ /\xEA\x99/; + return "Cyrillic letter" if $c =~ /\xEA\x9A[\x80-\x9F]/; + return "modifier tone" if $c =~ /\xEA\x9C[\x80-\xA1]/; + return "Javanese punctuation" if $c =~ /\xEA\xA7[\x81-\x8D\x9E-\x9F]/; + return "Javanese digit" if $c =~ /\xEA\xA7[\x90-\x99]/; + return "Javanese letter" if $c =~ /\xEA\xA6/; + return "Javanese letter" if $c =~ /\xEA\xA7[\x80-\x9F]/; + return "Ethiopic syllable" if $c =~ /\xEA\xAC[\x80-\xAF]/; + return "Cherokee letter" if $c =~ /\xEA\xAD[\xB0-\xBF]/; + return "Cherokee letter" if $c =~ /\xEA\xAE/; + return "Meetai Mayek digit" if $c =~ /\xEA\xAF[\xB0-\xB9]/; + return "Meetai Mayek letter" if $c =~ /\xEA\xAF/; + return "Korean Hangul character" if $c =~ /\xEA[\xB0-\xBF]/; + return "Korean Hangul character" if $c =~ /[\xEB-\xEC]/; + return "Korean Hangul character" if $c =~ /\xED[\x80-\x9E]/; + return "Klingon letter" if $c =~ /\xEF\xA3[\x90-\xA9]/; + return "Klingon digit" if $c =~ /\xEF\xA3[\xB0-\xB9]/; + return "Klingon punctuation" if $c =~ /\xEF\xA3[\xBD-\xBE]/; + return "Klingon symbol" if $c =~ /\xEF\xA3\xBF/; + return "private use character" if $c =~ /\xEE/; + return "Latin typographic ligature" if $c =~ /\xEF\xAC[\x80-\x86]/; + return "Arabic presentation letter" if $c =~ /\xEF\xAD[\x90-\xBF]/; + return "Arabic presentation letter" if $c =~ /\xEF[\xAE-\xB7]/; + return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\x90-\x99]/; + return "non-ASCII punctuation" if $c =~ /\xEF\xB8[\xB0-\xBF]/; + return "non-ASCII punctuation" if $c =~ /\xEF\xB9[\x80-\xAB]/; + return "Arabic presentation letter" if $c =~ /\xEF\xB9[\xB0-\xBF]/; + return "Arabic presentation letter" if $c =~ /\xEF\xBA/; + return "Arabic presentation letter" if $c =~ /\xEF\xBB[\x80-\xBC]/; + return "byte-order mark/zero-width no-break space" if $c eq "\xEF\xBB\xBF"; + return "fullwidth currency" if $c =~ /\xEF\xBC\x84/; + return "fullwidth digit" if $c =~ /\xEF\xBC[\x90-\x99]/; + return "fullwidth Latin letter" if $c =~ /\xEF\xBC[\xA1-\xBA]/; + return "fullwidth Latin letter" if $c =~ /\xEF\xBD[\x81-\x9A]/; + return "fullwidth punctuation" if $c =~ /\xEF\xBC/; + return "fullwidth punctuation" if $c =~ /\xEF\xBD[\x9B-\xA4]/; + return "halfwidth Japanese punctuation" if $c =~ /\xEF\xBD[\xA1-\xA4]/; + return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBD[\xA5-\xBF]/; + return "halfwidth Japanese katakana character" if $c =~ /\xEF\xBE[\x80-\x9F]/; + return "fullwidth currency" if $c =~ /\xEF\xBF[\xA0-\xA6]/; + return "replacement character" if $c eq "\xEF\xBF\xBD"; + } elsif ($c =~ /[\xF0-\xF7]/) { + return "non-UTF8 (invalid)" unless $c =~ /[\xF0-\xF7][\x80-\xBF]{3,3}$/; + return "non-shortest-UTF8 (invalid)" if $c =~ /\xF0[\x80-\x8F]/; + return "Linear B syllable" if $c =~ /\xF0\x90\x80/; + return "Linear B syllable" if $c =~ /\xF0\x90\x81[\x80-\x8F]/; + return "Linear B symbol" if $c =~ /\xF0\x90\x81[\x90-\x9F]/; + return "Linear B ideogram" if $c =~ /\xF0\x90[\x82-\x83]/; + return "Gothic letter" if $c =~ /\xF0\x90\x8C[\xB0-\xBF]/; + return "Gothic letter" if $c =~ /\xF0\x90\x8D[\x80-\x8F]/; + return "Phoenician letter" if $c =~ /\xF0\x90\xA4[\x80-\x95]/; + return "Phoenician digit" if $c =~ /\xF0\x90\xA4[\x96-\x9B]/; # number + return "Phoenician punctuation" if $c =~ /\xF0\x90\xA4\x9F/; # word separator + return "Old Hungarian digit" if $c =~ /\xF0\x90\xB3[\xBA-\xBF]/; # number + return "Old Hungarian letter" if $c =~ /\xF0\x90[\xB2-\xB3]/; + return "Cuneiform digit" if $c =~ /\xF0\x92\x90/; # numberic sign + return "Cuneiform digit" if $c =~ /\xF0\x92\x91[\x80-\xAF]/; # numberic sign + return "Cuneiform punctuation" if $c =~ /\xF0\x92\x91[\xB0-\xBF]/; + return "Cuneiform sign" if $c =~ /\xF0\x92[\x80-\x95]/; + return "Egyptian hieroglyph" if $c =~ /\xF0\x93[\x80-\x90]/; + return "enclosed alphanumeric" if $c =~ /\xF0\x9F[\x84-\x87]/; + return "pictograph" if $c =~ /\xF0\x9F[\x8C-\xA7]/; + return "CJK character" if $c =~ /\xF0[\xA0-\xAF]/; + return "variation selector" if $c =~ /\xF3\xA0[\x84-\x87]/; + return "private use character" if $c =~ /\xF3[\xB0-\xBF]/; + return "private use character" if $c =~ /\xF4[\x80-\x8F]/; + # ... + } elsif ($c =~ /[\xF8-\xFB]/) { + return "non-UTF8 (invalid)" unless $c =~ /[\xF8-\xFB][\x80-\xBF]{4,4}$/; + } elsif ($c =~ /[\xFC-\xFD]/) { + return "non-UTF8 (invalid)" unless $c =~ /[\xFC-\xFD][\x80-\xBF]{5,5}$/; + } elsif ($c =~ /\xFE/) { + return "non-UTF8 (invalid)" unless $c =~ /\xFE][\x80-\xBF]{6,6}$/; + } else { + return "non-UTF8 (invalid)"; + } + return "other character"; +} + +1; + + diff --git a/v1.3.7/lib/NLP/utilities.pm b/v1.3.7/lib/NLP/utilities.pm new file mode 100755 index 0000000..6f355f6 --- /dev/null +++ b/v1.3.7/lib/NLP/utilities.pm @@ -0,0 +1,3608 @@ +################################################################ +# # +# utilities # +# # +################################################################ + +package NLP::utilities; + +use File::Spec; +use Time::HiRes qw(time); +use Time::Local; +use NLP::English; +use NLP::UTF8; + +$utf8 = NLP::UTF8; +$englishPM = NLP::English; + +%empty_ht = (); + +use constant DEBUGGING => 0; + +sub member { + local($this,$elem,@array) = @_; + + my $a; + if (defined($elem)) { + foreach $a (@array) { + if (defined($a)) { + return 1 if $elem eq $a; + } else { + $DB::single = 1; # debugger breakpoint + print STDERR "\nWarning: Undefined variable utilities::member::a\n"; + } + } + } else { + $DB::single = 1; # debugger breakpoint + print STDERR "\nWarning: Undefined variable utilities::member::elem\n"; + } + return 0; +} + +sub dual_member { + local($this,$elem1,$elem2,*array1,*array2) = @_; + # returns 1 if there exists a position $n + # such that $elem1 occurs at position $n in @array1 + # and $elem2 occurs at same position $n in @array2 + + return 0 unless defined($elem1) && defined($elem2); + my $last_index = ($#array1 < $#array2) ? $#array1 : $#array2; #min + my $a; + my $b; + foreach $i ((0 .. $last_index)) { + return 1 if defined($a = $array1[$i]) && defined($b = $array2[$i]) && ($a eq $elem1) && ($b eq $elem2); + } + return 0; +} + +sub sorted_list_equal { + local($this,*list1,*list2) = @_; + + return 0 unless $#list1 == $#list2; + foreach $i ((0 .. $#list1)) { + return 0 unless $list1[$i] eq $list2[$i]; + } + return 1; +} + +sub trim { + local($this, $s) = @_; + + $s =~ s/^\s*//; + $s =~ s/\s*$//; + $s =~ s/\s+/ /g; + return $s; +} + +sub trim2 { + local($this, $s) = @_; + + $s =~ s/^\s*//; + $s =~ s/\s*$//; + return $s; +} + +sub trim_left { + local($this, $s) = @_; + $s =~ s/^\s*//; + return $s; +} + +sub cap_member { + local($this,$elem,@array) = @_; + + my $a; + my $lc_elem = lc $elem; + foreach $a (@array) { + return $a if $lc_elem eq lc $a; + } + return ""; +} + +sub remove_elem { + local($this,$elem,@array) = @_; + + return @array unless $this->member($elem, @array); + @rm_list = (); + foreach $a (@array) { + push(@rm_list, $a) unless $elem eq $a; + } + return @rm_list; +} + +sub intersect_p { + local($this,*list1,*list2) = @_; + + foreach $elem1 (@list1) { + if (defined($elem1)) { + foreach $elem2 (@list2) { + if (defined($elem2)) { + return 1 if $elem1 eq $elem2; + } else { + $DB::single = 1; # debugger breakpoint + print STDERR "\nWarning: Undefined variable utilities::intersect_p::elem2\n"; + } + } + } else { + $DB::single = 1; # debugger breakpoint + print STDERR "\nWarning: Undefined variable utilities::intersect_p::elem1\n"; + } + } + return 0; +} + +sub intersect_expl_p { + local($this,*list1,@list2) = @_; + + foreach $elem1 (@list1) { + foreach $elem2 (@list2) { + return 1 if $elem1 eq $elem2; + } + } + return 0; +} + +sub intersection { + local($this,*list1,*list2) = @_; + + @intersection_list = (); + foreach $elem1 (@list1) { + foreach $elem2 (@list2) { + push(@intersection_list, $elem1) if ($elem1 eq $elem2) && ! $this->member($elem1, @intersection_list); + } + } + return @intersection_list; +} + +sub cap_intersect_p { + local($this,*list1,*list2) = @_; + + foreach $elem1 (@list1) { + $lc_elem1 = lc $elem1; + foreach $elem2 (@list2) { + return 1 if $lc_elem1 eq lc $elem2; + } + } + return 0; +} + +sub subset_p { + local($this,*list1,*list2) = @_; + + foreach $elem1 (@list1) { + return 0 unless $this->member($elem1, @list2); + } + return 1; +} + +sub cap_subset_p { + local($this,*list1,*list2) = @_; + + foreach $elem1 (@list1) { + return 0 unless $this->cap_member($elem1, @list2); + } + return 1; +} + +sub unique { + local($this, @list) = @_; + + my %seen = (); + @uniq = (); + foreach $item (@list) { + push(@uniq, $item) unless $seen{$item}++; + } + return @uniq; +} + +sub position { + local($this,$elem,@array) = @_; + $i = 0; + foreach $a (@array) { + return $i if $elem eq $a; + $i++; + } + return -1; +} + +sub positions { + local($this,$elem,@array) = @_; + $i = 0; + @positions_in_list = (); + foreach $a (@array) { + push(@positions_in_list, $i) if $elem eq $a; + $i++; + } + return @positions_in_list; +} + +sub last_position { + local($this,$elem,@array) = @_; + + $result = -1; + $i = 0; + foreach $a (@array) { + $result = $i if $elem eq $a; + $i++; + } + return $result; +} + +sub rand_n_digit_number { + local($this,$n) = @_; + + return 0 unless $n =~ /^[1-9]\d*$/; + $ten_power_n = 10 ** ($n - 1); + return int(rand(9 * $ten_power_n)) + $ten_power_n; +} + +# Consider File::Temp +sub new_tmp_filename { + local($this,$filename) = @_; + + $loop_limit = 1000; + ($dir,$simple_filename) = ($filename =~ /^(.+)\/([^\/]+)$/); + $simple_filename = $filename unless defined($simple_filename); + $new_filename = "$dir/tmp-" . $this->rand_n_digit_number(8) . "-$simple_filename"; + while ((-e $new_filename) && ($loop_limit-- >= 0)) { + $new_filename = "$dir/tmp-" . $this->rand_n_digit_number(8) . "-$simple_filename"; + } + return $new_filename; +} + +# support sorting order: "8", "8.0", "8.5", "8.5.1.", "8.10", "10", "10-12" + +sub compare_complex_numeric { + local($this,$a,$b) = @_; + + (my $a_num,my $a_rest) = ($a =~ /^(\d+)\D*(.*)$/); + (my $b_num,my $b_rest) = ($b =~ /^(\d+)\D*(.*)$/); + + if (defined($a_rest) && defined($b_rest)) { + return ($a_num <=> $b_num) + || $this->compare_complex_numeric($a_rest,$b_rest); + } else { + return $a cmp $b; + } +} + +# support sorting order: "lesson8-ps-v1.9.xml", "Lesson 10_ps-v_1.11.xml" +# approach: segment strings into alphabetic and numerical sections and compare pairwise + +sub compare_mixed_alpha_numeric { + local($this,$a,$b) = @_; + + ($a_alpha,$a_num,$a_rest) = ($a =~ /^(\D*)(\d[-\d\.]*)(.*)$/); + ($b_alpha,$b_num,$b_rest) = ($b =~ /^(\D*)(\d[-\d\.]*)(.*)$/); + + ($a_alpha) = ($a =~ /^(\D*)/) unless defined $a_alpha; + ($b_alpha) = ($b =~ /^(\D*)/) unless defined $b_alpha; + + # ignore non-alphabetic characters in alpha sections + $a_alpha =~ s/\W|_//g; + $b_alpha =~ s/\W|_//g; + + if ($alpha_cmp = lc $a_alpha cmp lc $b_alpha) { + return $alpha_cmp; + } elsif (defined($a_rest) && defined($b_rest)) { + return $this->compare_complex_numeric($a_num,$b_num) + || $this->compare_mixed_alpha_numeric ($a_rest,$b_rest); + } else { + return (defined($a_num) <=> defined($b_num)) || ($a cmp $b); + } +} + +# @sorted_lessons = sort { NLP::utilities->compare_mixed_alpha_numeric($a,$b) } @lessons; + +sub html_guarded_p { + local($this,$string) = @_; + + return 0 if $string =~ /[<>"]/; + @segs = split('&',$string); + shift @segs; + foreach $seg (@segs) { + next if $seg =~ /^amp;/; + next if $seg =~ /^quot;/; + next if $seg =~ /^nbsp;/; + next if $seg =~ /^gt;/; + next if $seg =~ /^lt;/; + next if $seg =~ /^#(\d+);/; + next if $seg =~ /^#x([0-9a-fA-F]+);/; + return 0; + } + return 1; +} + +sub guard_tooltip_text { + local($this,$string) = @_; + + $string =~ s/\xCB\x88/'/g; + return $string; +} + +sub guard_html { + local($this,$string,$control_string) = @_; + + return "" unless defined($string); + my $guarded_string; + $control_string = "" unless defined($control_string); + return $string if ($string =~ /&/) + && (! ($control_string =~ /\bstrict\b/)) + && $this->html_guarded_p($string); + $guarded_string = $string; + $guarded_string =~ s/&/&/g; + if ($control_string =~ /slash quote/) { + $guarded_string =~ s/"/\\"/g; + } elsif ($control_string =~ /keep quote/) { + } else { + $guarded_string =~ s/\"/"/g; + } + if ($control_string =~ /escape-slash/) { + $guarded_string =~ s/\//&x2F;/g; + } + $guarded_string =~ s/>/>/g; + $guarded_string =~ s/</</g; + return $guarded_string; +} + +sub unguard_html { + local($this,$string) = @_; + + return undef unless defined($string); + $string=~ s[&(\S*?);]{ + local $_ = $1; + /^amp$/i ? "&" : + /^quot$/i ? '"' : + /^apos$/i ? "'" : + /^gt$/i ? ">" : + /^lt$/i ? "<" : + /^x2F$/i ? "/" : + /^nbsp$/i ? "\xC2\xA0" : + /^#(\d+)$/ ? $this->chr($1) : + /^#x([0-9a-f]+)$/i ? $this->chr(hex($1)) : + $_ + }gex; + return $string; +} + +sub unguard_html_r { + local($this,$string) = @_; + + return undef unless defined($string); + + $string =~ s/&/&/g; + $string =~ s/"/'/g; + $string =~ s/</</g; + $string =~ s/>/>/g; + + ($d) = ($string =~ /&#(\d+);/); + while (defined($d)) { + $c = $this->chr($d); + $string =~ s/&#$d;/$c/g; + ($d) = ($string =~ /&#(\d+);/); + } + ($x) = ($string =~ /&#x([0-9a-f]+);/i); + while (defined($x)) { + $c = $this->chr(hex($x)); + $string =~ s/&#x$x;/$c/g; + ($x) = ($string =~ /&#x([0-9a-f]+);/i); + } + $string0 = $string; + ($x) = ($string =~ /(?:https?|www|\.com)\S*\%([0-9a-f]{2,2})/i); + while (defined($x)) { + $c = $this->chr("%" . hex($x)); + $string =~ s/\%$x/$c/g; + ($x) = ($string =~ /(?:https?|www|\.com)\S*\%([0-9a-f]{2,2})/i); + } + return $string; +} + +sub unguard_html_l { + local($caller,$string) = @_; + + return undef unless defined($string); + + my $pre; + my $core; + my $post; + my $repl; + my $s = $string; + if (($pre,$core,$post) = ($s =~ /^(.*)&(amp|quot|lt|gt|#\d+|#x[0-9a-f]+);(.*)$/i)) { + $repl = "?"; + $repl = "&" if $core =~ /^amp$/i; + $repl = "'" if $core =~ /^quot$/i; + $repl = "<" if $core =~ /^lt$/i; + $repl = ">" if $core =~ /^gt$/i; + if ($core =~ /^#\d+$/i) { + $core2 = substr($core,1); + $repl = $caller->chr($core2); + } + $repl = $caller->chr(hex(substr($core,2))) if $core =~ /^#x[0-9a-f]+$/i; + $s = $pre . $repl . $post; + } + return $s; +} + +sub guard_html_quote { + local($caller,$string) = @_; + + $string =~ s/"/"/g; + return $string; +} + +sub unguard_html_quote { + local($caller,$string) = @_; + + $string =~ s/"/"/g; + return $string; +} + +sub uri_encode { + local($caller,$string) = @_; + + $string =~ s/([^^A-Za-z0-9\-_.!~*()'])/ sprintf "%%%02x", ord $1 /eg; + return $string; +} + +sub uri_decode { + local($caller,$string) = @_; + + $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; + return $string; +} + +sub remove_xml_tags { + local($caller,$string) = @_; + + $string =~ s/<\/?[a-zA-Z][-_:a-zA-Z0-9]*(\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>//g; + return $string; +} + +sub remove_any_tokenization_at_signs_around_xml_tags { + local($caller,$string) = @_; + + $string =~ s/(?:\@ \@)?(<[^<>]+>)(?:\@ \@)?/$1/g; + $string =~ s/\@?(<[^<>]+>)\@?/$1/g; + return $string; +} + +sub remove_xml_tags_and_any_bordering_at_signs { + # at-signs from tokenization + local($caller,$string) = @_; + + $string =~ s/\@?<\/?[a-zA-Z][-_:a-zA-Z0-9]*(\s+[a-zA-Z][-_:a-zA-Z0-9]*=\"[^"]*\")*\s*\/?>\@?//g; + return $string; +} + +sub chr { + local($caller,$i) = @_; + + return undef unless $i =~ /^\%?\d+$/; + if ($i =~ /^%/) { + $i =~ s/^\%//; + return chr($i) if $i < 128; + return "\x80" | chr($i - 128) if $i < 256; + } else { + return chr($i) if $i < 128; + return ("\xC0" | chr(($i / 64) % 32)) + . ("\x80" | chr($i % 64)) if $i < 2048; + return ("\xE0" | chr(int($i / 4096) % 16)) + . ("\x80" | chr(int($i / 64) % 64)) + . ("\x80" | chr($i % 64)) if $i < 65536; + return ("\xF0" | chr(int($i / 262144) % 8)) + . ("\x80" | chr(int($i / 4096) % 64)) + . ("\x80" | chr(int($i / 64) % 64)) + . ("\x80" | chr($i % 64)) if $i < 2097152; + } + return "?"; +} + +sub guard_cgi { + local($caller, $string) = @_; + + $guarded_string = $string; + if ($string =~ /[\x80-\xFF]/) { + $guarded_string = ""; + while ($string ne "") { + $char = substr($string, 0, 1); + $string = substr($string, 1); + if ($char =~ /^[\\ ;\#\&\:\=\"\'\+\?\x00-\x1F\x80-\xFF]$/) { + $hex = sprintf("%2.2x",ord($char)); + $guarded_string .= uc "%$hex"; + } else { + $guarded_string .= $char; + } + } + } else { + $guarded_string = $string; + $guarded_string =~ s/%/%25/g; + $guarded_string =~ s/\n/%5Cn/g; + $guarded_string =~ s/\t/%5Ct/g; + $guarded_string =~ s/ /%20/g; + $guarded_string =~ s/"/%22/g; + $guarded_string =~ s/#/%23/g; + $guarded_string =~ s/&/%26/g; + $guarded_string =~ s/'/%27/g; + $guarded_string =~ s/\+/%2B/g; + $guarded_string =~ s/\//%2F/g; + $guarded_string =~ s/:/%3A/g; + $guarded_string =~ s/;/%3B/g; + $guarded_string =~ s/</%3C/g; + $guarded_string =~ s/=/%3D/g; + $guarded_string =~ s/>/%3E/g; + $guarded_string =~ s/\?/%3F/g; + } + return $guarded_string; +} + +sub repair_cgi_guard { + local($caller,$string) = @_; + # undo second cgi-guard, e.g. "Jo%25C3%25ABlle_Aubron" -> "Jo%C3%ABlle_Aubron" + + $string =~ s/(%)25([CD][0-9A-F]%)25([89AB][0-9A-F])/$1$2$3/g; + $string =~ s/(%)25(E[0-9A-F]%)25([89AB][0-9A-F]%)25([89AB][0-9A-F])/$1$2$3$4/g; + return $string; +} + +sub unguard_cgi { + local($caller,$string) = @_; + + $unguarded_string = $string; + $unguarded_string =~ s/%5Cn/\n/g; + $unguarded_string =~ s/%5Ct/\t/g; + $unguarded_string =~ s/%20/ /g; + $unguarded_string =~ s/%23/#/g; + $unguarded_string =~ s/%26/&/g; + $unguarded_string =~ s/%2B/+/g; + $unguarded_string =~ s/%2C/,/g; + $unguarded_string =~ s/%3A/:/g; + $unguarded_string =~ s/%3D/=/g; + $unguarded_string =~ s/%3F/?/g; + $unguarded_string =~ s/%C3%A9/\xC3\xA9/g; + + # more general + ($code) = ($unguarded_string =~ /%([0-9A-F]{2,2})/); + while (defined($code)) { + $percent_code = "%" . $code; + $hex_code = sprintf("%c", hex($code)); + $unguarded_string =~ s/$percent_code/$hex_code/g; + ($code) = ($unguarded_string =~ /%([0-9A-F]{2,2})/); + } + + return $unguarded_string; +} + +sub regex_guard { + local($caller,$string) = @_; + + $guarded_string = $string; + $guarded_string =~ s/([\\\/\^\|\(\)\{\}\$\@\*\+\?\.\[\]])/\\$1/g + if $guarded_string =~ /[\\\/\^\|\(\)\{\}\$\@\*\+\?\.\[\]]/; + + return $guarded_string; +} + +sub g_regex_spec_tok_p { + local($this,$string) = @_; + + # specials: ( ) (?: ) [ ] + return ($string =~ /^(\(\?:|[()\[\]])$/); +} + +sub regex_guard_norm { + local($this,$string) = @_; + + return $string unless $string =~ /[\[\]\\()$@?+]/; + my $rest = $string; + my @stack = (""); + while ($rest ne "") { + # specials: ( ) (?: ) [ ] ? + + if (($pre, $special, $post) = ($rest =~ /^((?:\\.|[^\[\]()?+])*)(\(\?:|[\[\]()?+])(.*)$/)) { + # print STDERR "Special: $pre *$special* $post\n"; + unless ($pre eq "") { + push(@stack, $pre); + while (($#stack >= 1) && (! $this->g_regex_spec_tok_p($stack[$#stack-1])) + && (! $this->g_regex_spec_tok_p($stack[$#stack]))) { + $s1 = pop @stack; + $s2 = pop @stack; + push(@stack, "$s2$s1"); + } + } + if ($special =~ /^[?+]$/) { + push(@stack, "\\") if ($stack[$#stack] eq "") + || ($this->g_regex_spec_tok_p($stack[$#stack]) && ($stack[$#stack] ne "[")); + push(@stack, $special); + } elsif ($special eq "]") { + if (($#stack >= 1) && ($stack[$#stack-1] eq "[") && ! $this->g_regex_spec_tok_p($stack[$#stack])) { + $char_expression = pop @stack; + pop @stack; + push(@stack, "[$char_expression]"); + } else { + push(@stack, $special); + } + } elsif (($special =~ /^[()]/) && (($stack[$#stack] eq "[") + || (($#stack >= 1) + && ($stack[$#stack-1] eq "[") + && ! $this->g_regex_spec_tok_p($stack[$#stack])))) { + push(@stack, "\\$special"); + } elsif ($special eq ")") { + if (($#stack >= 1) && ($stack[$#stack-1] =~ /^\((\?:)?$/) && ! $this->g_regex_spec_tok_p($stack[$#stack])) { + $alt_expression = pop @stack; + $open_para = pop @stack; + if ($open_para eq "(") { + push(@stack, "(?:$alt_expression)"); + } else { + push(@stack, "$open_para$alt_expression)"); + } + } else { + push(@stack, $special); + } + } else { + push(@stack, $special); + } + while (($#stack >= 1) && (! $this->g_regex_spec_tok_p($stack[$#stack-1])) + && (! $this->g_regex_spec_tok_p($stack[$#stack]))) { + $s1 = pop @stack; + $s2 = pop @stack; + push(@stack, "$s2$s1"); + } + $rest = $post; + } else { + push(@stack, $rest); + $rest = ""; + } + } + # print STDERR "Stack: " . join(";", @stack) . "\n"; + foreach $i ((0 .. $#stack)) { + $stack_elem = $stack[$i]; + if ($stack_elem =~ /^[()\[\]]$/) { + $stack[$i] = "\\" . $stack[$i]; + } + } + return join("", @stack); +} + +sub string_guard { + local($caller,$string) = @_; + + return "" unless defined($string); + $guarded_string = $string; + $guarded_string =~ s/([\\"])/\\$1/g + if $guarded_string =~ /[\\"]/; + + return $guarded_string; +} + +sub guard_javascript_arg { + local($caller,$string) = @_; + + return "" unless defined($string); + $guarded_string = $string; + $guarded_string =~ s/\\/\\\\/g; + $guarded_string =~ s/'/\\'/g; + return $guarded_string; +} + +sub guard_substitution_right_hand_side { + # "$1x" => "$1 . \"x\"" + local($caller,$string) = @_; + + my $result = ""; + ($pre,$var,$post) = ($string =~ /^([^\$]*)(\$\d)(.*)$/); + while (defined($var)) { + $result .= " . " if $result; + $result .= "\"$pre\" . " unless $pre eq ""; + $result .= $var; + $string = $post; + ($pre,$var,$post) = ($string =~ /^([^\$]*)(\$\d)(.*)$/); + } + $result .= " . \"$string\"" if $string; + return $result; +} + +sub string_starts_with_substring { + local($caller,$string,$substring) = @_; + + $guarded_substring = $caller->regex_guard($substring); + return $string =~ /^$guarded_substring/; +} + +sub one_string_starts_with_the_other { + local($caller,$s1,$s2) = @_; + + return ($s1 eq $s2) + || $caller->string_starts_with_substring($s1,$s2) + || $caller->string_starts_with_substring($s2,$s1); +} + +sub string_ends_in_substring { + local($caller,$string,$substring) = @_; + + $guarded_substring = $caller->regex_guard($substring); + return $string =~ /$guarded_substring$/; +} + +sub string_equal_ignore_leading_multiple_or_trailing_blanks { + local($caller,$string1,$string2) = @_; + + return 1 if $string1 eq $string2; + $string1 =~ s/\s+/ /; + $string2 =~ s/\s+/ /; + $string1 =~ s/^\s+//; + $string2 =~ s/^\s+//; + $string1 =~ s/\s+$//; + $string2 =~ s/\s+$//; + + return $string1 eq $string2; +} + +sub strip_substring_from_start_of_string { + local($caller,$string,$substring,$error_code) = @_; + + $error_code = "ERROR" unless defined($error_code); + my $reg_surf = $caller->regex_guard($substring); + if ($string =~ /^$guarded_substring/) { + $string =~ s/^$reg_surf//; + return $string; + } else { + return $error_code; + } +} + +sub strip_substring_from_end_of_string { + local($caller,$string,$substring,$error_code) = @_; + + $error_code = "ERROR" unless defined($error_code); + my $reg_surf = $caller->regex_guard($substring); + if ($string =~ /$reg_surf$/) { + $string =~ s/$reg_surf$//; + return $string; + } else { + return $error_code; + } +} + +# to be deprecated +sub lang_code { + local($caller,$language) = @_; + + $langPM = NLP::Language->new(); + return $langPM->lang_code($language); +} + +sub full_language { + local($caller,$lang_code) = @_; + + return "Arabic" if $lang_code eq "ar"; + return "Chinese" if $lang_code eq "zh"; + return "Czech" if $lang_code eq "cs"; + return "Danish" if $lang_code eq "da"; + return "Dutch" if $lang_code eq "nl"; + return "English" if $lang_code eq "en"; + return "Finnish" if $lang_code eq "fi"; + return "French" if $lang_code eq "fr"; + return "German" if $lang_code eq "de"; + return "Greek" if $lang_code eq "el"; + return "Hebrew" if $lang_code eq "he"; + return "Hindi" if $lang_code eq "hi"; + return "Hungarian" if $lang_code eq "hu"; + return "Icelandic" if $lang_code eq "is"; + return "Indonesian" if $lang_code eq "id"; + return "Italian" if $lang_code eq "it"; + return "Japanese" if $lang_code eq "ja"; + return "Kinyarwanda" if $lang_code eq "rw"; + return "Korean" if $lang_code eq "ko"; + return "Latin" if $lang_code eq "la"; + return "Malagasy" if $lang_code eq "mg"; + return "Norwegian" if $lang_code eq "no"; + return "Pashto" if $lang_code eq "ps"; + return "Persian" if $lang_code eq "fa"; + return "Polish" if $lang_code eq "pl"; + return "Portuguese" if $lang_code eq "pt"; + return "Romanian" if $lang_code eq "ro"; + return "Russian" if $lang_code eq "ru"; + return "Spanish" if $lang_code eq "es"; + return "Swedish" if $lang_code eq "sv"; + return "Turkish" if $lang_code eq "tr"; + return "Urdu" if $lang_code eq "ur"; + return ""; +} + +# to be deprecated +sub short_lang_name { + local($caller,$lang_code) = @_; + + $langPM = NLP::Language->new(); + return $langPM->shortname($lang_code); +} + +sub ml_dir { + local($caller,$language,$type) = @_; + + $type = "MSB" unless defined($type); + $lang_code = $langPM->lang_code($language); + return $caller->ml_dir($lang_code, "lex") . "/corpora" if $type eq "corpora"; + return "" unless defined($rc); + $ml_home = $rc->ml_home_dir(); + return File::Spec->catfile($ml_home, "arabic") + if ($lang_code eq "ar-iq") && ! $caller->member(lc $type,"lex","onto","dict"); + $langPM = NLP::Language->new(); + $lexdir = $langPM->lexdir($lang_code); + return $lexdir if defined($lexdir); + return ""; +} + +sub language_lex_filename { + local($caller,$language,$type) = @_; + + $langPM = NLP::Language->new(); + if (($lang_code = $langPM->lang_code($language)) + && ($ml_dir = $caller->ml_dir($lang_code,$type)) + && ($norm_language = $caller->short_lang_name($lang_code))) { + return "$ml_dir/$norm_language-lex" if ($type eq "lex"); + return "$ml_dir/onto" if ($type eq "onto"); + return "$ml_dir/$norm_language-english-dict" if ($type eq "dict") && !($lang_code eq "en"); + return ""; + } else { + return ""; + } +} + +# filename_without_path is obsolete - replace with +# use File::Basename; +# basename($filename) +sub filename_without_path { + local($caller,$filename) = @_; + + $filename =~ s/^.*\/([^\/]+)$/$1/; + return $filename; +} + +sub option_string { + local($caller,$input_name,$default,*values,*labels) = @_; + + my $s = "<select id=\"$input_name\" name=\"$input_name\" size=\"1\">"; + for $i (0 .. $#values) { + my $value = $values[$i]; + my $label = $labels[$i]; + my $selected_clause = ($default eq $value) ? "selected" : ""; + $s .= "<option $selected_clause value=\"$value\">$label</option>"; + } + $s .= "</select>"; + return $s; +} + +sub pes_subseq_surf { + local($this,$start,$length,$langCode,@pes) = @_; + + my $surf = ""; + if ($start+$length-1 <= $#pes) { + foreach $i ($start .. $start + $length - 1) { + my $pe = $pes[$i]; + $surf .= $pe->get("surf",""); + $surf .= " " if $langCode =~ /^(ar|en|fr)$/; + } + } + $surf =~ s/\s+$//; + return $surf; +} + +sub copyList { + local($this,@list) = @_; + + @copy_list = (); + foreach $elem (@list) { + push(@copy_list,$elem); + } + return @copy_list; +} + +sub list_with_same_elem { + local($this,$size,$elem) = @_; + + @list = (); + foreach $i (0 .. $size-1) { + push(@list,$elem); + } + return @list; +} + +sub count_occurrences { + local($this,$s,$substring) = @_; + + $occ = 0; + $new = $s; + $guarded_substring = $this->regex_guard($substring); + $new =~ s/$guarded_substring//; + while ($new ne $s) { + $occ++; + $s = $new; + $new =~ s/$guarded_substring//; + } + return $occ; +} + +sub position_of_nth_occurrence { + local($this,$s,$substring,$occ) = @_; + + return -1 unless $occ > 0; + my $pos = 0; + while (($pos = index($s, $substring, $pos)) >= 0) { + return $pos if $occ == 1; + $occ--; + $pos = $pos + length($substring); + } + return -1; +} + +sub has_diff_elements_p { + local($this,@array) = @_; + + return 0 if $#array < 1; + $elem = $array[0]; + + foreach $a (@array) { + return 1 if $elem ne $a; + } + return 0; +} + +sub init_log { + local($this,$logfile, $control) = @_; + + $control = "" unless defined($control); + if ((DEBUGGING || ($control =~ /debug/i)) && $logfile) { + system("rm -f $logfile"); + system("date > $logfile; chmod 777 $logfile"); + } +} + +sub time_stamp_log { + local($this,$logfile, $control) = @_; + + $control = "" unless defined($control); + if ((DEBUGGING || ($control =~ /debug/i)) && $logfile) { + system("date >> $logfile; chmod 777 $logfile"); + } +} + +sub log { + local($this,$message,$logfile,$control) = @_; + + $control = "" unless defined($control); + if ((DEBUGGING || ($control =~ /debug/i)) && $logfile) { + $this->init_log($logfile, $control) unless -w $logfile; + if ($control =~ /timestamp/i) { + $this->time_stamp_log($logfile, $control); + } + $guarded_message = $message; + $guarded_message =~ s/"/\\"/g; + system("echo \"$guarded_message\" >> $logfile"); + } +} + +sub month_name_to_month_number { + local($this,$month_name) = @_; + + $month_name_init = lc substr($month_name,0,3); + return $this->position($month_name_init, "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec") + 1; +} + +my @short_month_names = ("Jan.","Febr.","March","April","May","June","July","Aug.","Sept.","Oct.","Nov.","Dec."); +my @full_month_names = ("January","February","March","April","May","June","July","August","September","October","November","December"); + +sub month_number_to_month_name { + local($this,$month_number, $control) = @_; + + $month_number =~ s/^0//; + if ($month_number =~ /^([1-9]|1[0-2])$/) { + return ($control && ($control =~ /short/i)) + ? $short_month_names[$month_number-1] + : $full_month_names[$month_number-1]; + } else { + return ""; + } +} + +sub datetime { + local($this,$format,$time_in_secs, $command) = @_; + + $command = "" unless defined($command); + $time_in_secs = time unless defined($time_in_secs) && $time_in_secs; + @time_vector = ($command =~ /\b(gm|utc)\b/i) ? gmtime($time_in_secs) : localtime($time_in_secs); + ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=@time_vector; + $thisyear = $year + 1900; + $thismon=(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon]; + $thismon2=("Jan.","Febr.","March","April","May","June","July","Aug.","Sept.","Oct.","Nov.","Dec.")[$mon]; + $thismonth = $mon + 1; + $thisday=(Sun,Mon,Tue,Wed,Thu,Fri,Sat)[$wday]; + $milliseconds = int(($time_in_secs - int($time_in_secs)) * 1000); + $date="$thisday $thismon $mday, $thisyear"; + $sdate="$thismon $mday, $thisyear"; + $dashedDate = sprintf("%04d-%02d-%02d",$thisyear,$thismonth,$mday); + $slashedDate = sprintf("%02d/%02d/%04d",$mday,$thismonth,$thisyear); + $time=sprintf("%02d:%02d:%02d",$hour,$min,$sec); + $shorttime=sprintf("%d:%02d",$hour,$min); + $shortdatetime = "$thismon2 $mday, $shorttime"; + + if ($date =~ /undefined/) { + return ""; + } elsif ($format eq "date at time") { + return "$date at $time"; + } elsif ($format eq "date") { + return "$date"; + } elsif ($format eq "sdate") { + return "$sdate"; + } elsif ($format eq "ddate") { + return "$dashedDate"; + } elsif ($format eq "time") { + return "$time"; + } elsif ($format eq "dateTtime+ms") { + return $dashedDate . "T" . $time . "." . $milliseconds; + } elsif ($format eq "dateTtime") { + return $dashedDate . "T" . $time; + } elsif ($format eq "short date at time") { + return $shortdatetime; + } else { + return "$date at $time"; + } +} + +sub datetime_of_last_file_modification { + local($this,$format,$filename) = @_; + + return $this->datetime($format,(stat($filename))[9]); +} + +sub add_1sec { + local($this,$datetime) = @_; + + if (($year,$month,$day,$hour,$minute,$second) = ($datetime =~ /^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)$/)) { + $second++; + if ($second >= 60) { $second -= 60; $minute++; } + if ($minute >= 60) { $minute -= 60; $hour++; } + if ($hour >= 24) { $hour -= 24; $day++; } + if ($month =~ /^(01|03|05|07|08|10|12)$/) { + if ($day > 31) { $day -= 31; $month++; } + } elsif ($month =~ /^(04|06|09|11)$/) { + if ($day > 30) { $day -= 30; $month++; } + } elsif (($month eq "02") && $this->leap_year($year)) { + if ($day > 29) { $day -= 29; $month++; } + } elsif ($month eq "02") { + if ($day > 28) { $day -= 28; $month++; } + } + if ($month > 12) { $month -= 12; $year++; } + return sprintf("%04d-%02d-%02dT%02d:%02d:%02d", $year,$month,$day,$hour,$minute,$second); + } else { + return ""; + } +} + +sub leap_year { + local($this,$year) = @_; + + return 0 if $year % 4 != 0; + return 1 if $year % 400 == 0; + return 0 if $year % 100 == 0; + return 1; +} + +sub print_html_banner { + local($this,$text,$bgcolor,*OUT,$control) = @_; + + $control = "" unless defined($control); + $bgcolor = "#BBCCFF" unless defined($bgcolor); + print OUT "<table width=\"100%\" border=\"0\" cellpadding=\"3\" cellspacing=\"0\"><tr bgcolor=\"$bgcolor\"><td>"; + print OUT "  " unless $text =~ /^\s*<(table|nobr)/; + print OUT $text; + print OUT "</td></tr></table>\n"; + print OUT "<br />\n" unless $control =~ /nobr/i; +} + +sub print_html_head { + local($this, $title, *OUT, $control, $onload_fc, $add_javascript) = @_; + + $control = "" unless defined($control); + $onload_fc = "" unless defined($onload_fc); + $onload_clause = ($onload_fc) ? " onload=\"$onload_fc\"" : ""; + $add_javascript = "" unless defined($add_javascript); + $max_age_clause = ""; + $max_age_clause = "<meta http-equiv=\"cache-control\" content=\"max-age=3600\" \/>"; # if $control =~ /\bexp1hour\b/; + $css_clause = ""; + $css_clause = "\n <link rel=\"stylesheet\" type=\"text/css\" href=\"https://www.isi.edu/~ulf/css/handheld/default.css\" media=\"handheld\"\/>" if $control =~ /css/; + $css_clause .= "\n <link rel=\"stylesheet\" type=\"text/css\" href=\"https://www.isi.edu/~ulf/css/handheld/default.css\" media=\"only screen and (max-device-width:480px)\"\/>" if $control =~ /css/; + $css_clause = "\n <link rel=\"stylesheet\" type=\"text/css\" href=\"https://www.isi.edu/~ulf/css/handheld/default.css\">" if $control =~ /css-handheld/; + $icon_clause = ""; + $icon_clause .= "\n <link rel=\"shortcut icon\" href=\"https://www.isi.edu/~ulf/amr/images/AMR-favicon.ico\">" if $control =~ /\bAMR\b/i; + $icon_clause .= "\n <link rel=\"shortcut icon\" href=\"https://www.isi.edu/~ulf/croom/images/CRE-favicon.ico\">" if $control =~ /\bCRE\b/i; + print OUT "\xEF\xBB\xBF\n" unless $control =~ /\bno-bom\b/; # utf8 marker byte order mark + print OUT<<END_OF_HEADER1; +<html> + <head> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> + $max_age_clause + <title>$title$css_clause$icon_clause +END_OF_HEADER1 +; + + unless ($control =~ /no javascript/) { + print OUT< + + +END_OF_HEADER2 +; + } + + print OUT< + +END_OF_HEADER3 +; +} + + +sub print_html_foot { + local($this, *OUT) = @_; + + print OUT " \n"; + print OUT "\n"; +} + +sub print_html_page { + local($this, *OUT, $s) = @_; + + print OUT "\xEF\xBB\xBF\n"; + print OUT "\n"; + print OUT " \n"; + print OUT " DEBUG\n"; + print OUT " \n"; + print OUT " \n"; + print OUT " \n"; + print OUT " \n"; + print OUT " $s\n"; + print OUT " \n"; + print OUT "\n"; +} + +sub http_catfile { + local($this, @path) = @_; + + $result = File::Spec->catfile(@path); + $result =~ s/(https?):\/([a-zA-Z])/$1:\/\/$2/; + return $result; +} + +sub underscore_to_space { + local($this, $s) = @_; + + return "" unless defined($s); + + $s =~ s/_+/ /g; + return $s; +} + +sub space_to_underscore { + local($this, $s) = @_; + + return "" unless defined($s); + + $s =~ s/ /_/g; + return $s; +} + +sub remove_spaces { + local($this, $s) = @_; + + $s =~ s/\s//g; + return $s; +} + +sub is_punctuation_string_p { + local($this, $s) = @_; + + return "" unless $s; + $s = $this->normalize_string($s) if $s =~ /[\x80-\xBF]/; + return $s =~ /^[-_,;:.?!\/\@+*"()]+$/; +} + +sub is_rare_punctuation_string_p { + local($this, $s) = @_; + + return 0 unless $s =~ /^[\x21-\x2F\x3A\x40\x5B-\x60\x7B-\x7E]{2,}$/; + return 0 if $s =~ /^(\.{2,3}|-{2,3}|\*{2,3}|::|\@?[-\/:]\@?)$/; + return 1; +} + +sub simplify_punctuation { + local($this, $s) = @_; + + $s =~ s/\xE2\x80\x92/-/g; + $s =~ s/\xE2\x80\x93/-/g; + $s =~ s/\xE2\x80\x94/-/g; + $s =~ s/\xE2\x80\x95/-/g; + $s =~ s/\xE2\x80\x98/`/g; + $s =~ s/\xE2\x80\x99/'/g; + $s =~ s/\xE2\x80\x9A/`/g; + $s =~ s/\xE2\x80\x9C/"/g; + $s =~ s/\xE2\x80\x9D/"/g; + $s =~ s/\xE2\x80\x9E/"/g; + $s =~ s/\xE2\x80\x9F/"/g; + $s =~ s/\xE2\x80\xA2/*/g; + $s =~ s/\xE2\x80\xA4/./g; + $s =~ s/\xE2\x80\xA5/../g; + $s =~ s/\xE2\x80\xA6/.../g; + return $s; +} + +sub latin_plus_p { + local($this, $s, $control) = @_; + + $control = "" unless defined($control); + return $s =~ /^([\x20-\x7E]|\xC2[\xA1-\xBF]|[\xC3-\xCC][\x80-\xBF]|\xCA[\x80-\xAF]|\xE2[\x80-\xAF][\x80-\xBF])+$/; +} + +sub nth_line_in_file { + local($this, $filename, $n) = @_; + + return "" unless $n =~ /^[1-9]\d*$/; + open(IN, $filename) || return ""; + my $line_no = 0; + while () { + $line_no++; + if ($n == $line_no) { + $_ =~ s/\s+$//; + close(IN); + return $_; + } + } + close(IN); + return ""; +} + +sub read_file { + local($this, $filename) = @_; + + my $file_content = ""; + open(IN, $filename) || return ""; + while () { + $file_content .= $_; + } + close(IN); + return $file_content; +} + +sub cap_list { + local($this, @list) = @_; + + @cap_list = (); + foreach $l (@list) { + ($premod, $core) = ($l =~ /^(a|an) (\S.*)$/); + if (defined($premod) && defined($core)) { + push(@cap_list, "$premod \u$core"); + } elsif ($this->cap_member($l, "US")) { + push(@cap_list, uc $l); + } else { + push(@cap_list, "\u$l"); + } + } + return @cap_list; +} + +sub integer_list_with_commas_and_ranges { + local($this, @list) = @_; + + my $in_range_p = 0; + my $last_value = 0; + my $result = ""; + while (@list) { + $elem = shift @list; + if ($elem =~ /^\d+$/) { + if ($in_range_p) { + if ($elem == $last_value + 1) { + $last_value = $elem; + } else { + $result .= "-$last_value, $elem"; + if (@list && ($next = $list[0]) && ($elem =~ /^\d+$/) && ($next =~ /^\d+$/) + && ($next == $elem + 1)) { + $last_value = $elem; + $in_range_p = 1; + } else { + $in_range_p = 0; + } + } + } else { + $result .= ", $elem"; + if (@list && ($next = $list[0]) && ($elem =~ /^\d+$/) && ($next =~ /^\d+$/) + && ($next == $elem + 1)) { + $last_value = $elem; + $in_range_p = 1; + } + } + } else { + if ($in_range_p) { + $result .= "-$last_value, $elem"; + $in_range_p = 0; + } else { + $result .= ", $elem"; + } + } + } + if ($in_range_p) { + $result .= "-$last_value"; + } + $result =~ s/^,\s*//; + return $result; +} + +sub comma_append { + local($this, $a, $b) = @_; + + if (defined($a) && ($a =~ /\S/)) { + if (defined($b) && ($b =~ /\S/)) { + return "$a,$b"; + } else { + return $a; + } + } else { + if (defined($b) && ($b =~ /\S/)) { + return $b; + } else { + return ""; + } + } +} + +sub version { + return "3.17"; +} + +sub print_stderr { + local($this, $message, $verbose) = @_; + + $verbose = 1 unless defined($verbose); + print STDERR $message if $verbose; + return 1; +} + +sub compare_alignment { + local($this, $a, $b, $delimiter) = @_; + + $delimiter = "-" unless $delimiter; + my @a_list = split($delimiter, $a); + my @b_list = split($delimiter, $b); + + while (@a_list && @b_list) { + $a_head = shift @a_list; + $b_head = shift @b_list; + next if $a_head eq $b_head; + return $a_head <=> $b_head if ($a_head =~ /^\d+$/) && ($b_head =~ /^\d+$/); + return $a_head cmp $b_head; + } + return -1 if @a_list; + return 1 if @b_list; + return 0; +} + +sub normalize_string { + # normalize punctuation, full-width characters (to ASCII) + local($this, $s, $control) = @_; + + $control = "" unless defined($control); + + $norm_s = $s; + $norm_s =~ tr/A-Z/a-z/; + + $norm_s =~ s/ \@([-:\/])/ $1/g; # non-initial left @ + $norm_s =~ s/^\@([-:\/])/$1/; # initial left @ + $norm_s =~ s/([-:\/])\@ /$1 /g; # non-initial right @ + $norm_s =~ s/([-:\/])\@$/$1/; # initial right @ + $norm_s =~ s/([\(\)"])([,;.?!])/$1 $2/g; + $norm_s =~ s/\bcannot\b/can not/g; + + $norm_s =~ s/\xC2\xAD/-/g; # soft hyphen + + $norm_s =~ s/\xE2\x80\x94/-/g; # em dash + $norm_s =~ s/\xE2\x80\x95/-/g; # horizontal bar + $norm_s =~ s/\xE2\x80\x98/`/g; # grave accent + $norm_s =~ s/\xE2\x80\x99/'/g; # apostrophe + $norm_s =~ s/\xE2\x80\x9C/"/g; # left double quote mark + $norm_s =~ s/\xE2\x80\x9D/"/g; # right double quote mark + $norm_s =~ s/\xE2\x94\x80/-/g; # box drawings light horizontal + $norm_s =~ s/\xE2\x94\x81/-/g; # box drawings heavy horizontal + $norm_s =~ s/\xE3\x80\x81/,/g; # ideographic comma + $norm_s =~ s/\xE3\x80\x82/./g; # ideographic full stop + $norm_s =~ s/\xE3\x80\x88/"/g; # left angle bracket + $norm_s =~ s/\xE3\x80\x89/"/g; # right angle bracket + $norm_s =~ s/\xE3\x80\x8A/"/g; # left double angle bracket + $norm_s =~ s/\xE3\x80\x8B/"/g; # right double angle bracket + $norm_s =~ s/\xE3\x80\x8C/"/g; # left corner bracket + $norm_s =~ s/\xE3\x80\x8D/"/g; # right corner bracket + $norm_s =~ s/\xE3\x80\x8E/"/g; # left white corner bracket + $norm_s =~ s/\xE3\x80\x8F/"/g; # right white corner bracket + $norm_s =~ s/\xE3\x83\xBB/\xC2\xB7/g; # katakana middle dot -> middle dot + $norm_s =~ s/\xEF\xBB\xBF//g; # UTF8 marker + + if ($control =~ /\bzh\b/i) { + # de-tokenize Chinese + unless ($control =~ /\bpreserve-tok\b/) { + while ($norm_s =~ /[\xE0-\xEF][\x80-\xBF][\x80-\xBF] [\xE0-\xEF][\x80-\xBF][\x80-\xBF]/) { + $norm_s =~ s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF]) ([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/$1$2/g; + } + $norm_s =~ s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF]) ([\x21-\x7E])/$1$2/g; + $norm_s =~ s/([\x21-\x7E]) ([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/$1$2/g; + } + + # fullwidth characters + while ($norm_s =~ /\xEF\xBC[\x81-\xBF]/) { + ($pre,$fullwidth,$post) = ($norm_s =~ /^(.*)(\xEF\xBC[\x81-\xBF])(.*)$/); + $fullwidth =~ s/^\xEF\xBC//; + $fullwidth =~ tr/[\x81-\xBF]/[\x21-\x5F]/; + $norm_s = "$pre$fullwidth$post"; + } + while ($norm_s =~ /\xEF\xBD[\x80-\x9E]/) { + ($pre,$fullwidth,$post) = ($norm_s =~ /^(.*)(\xEF\xBD[\x80-\x9E])(.*)$/); + $fullwidth =~ s/^\xEF\xBD//; + $fullwidth =~ tr/[\x80-\x9E]/[\x60-\x7E]/; + $norm_s = "$pre$fullwidth$post"; + } + $norm_s =~ tr/A-Z/a-z/ unless $control =~ /\bpreserve-case\b/; + + unless ($control =~ /\bpreserve-tok\b/) { + while ($norm_s =~ /[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E] [\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]/) { + $norm_s =~ s/([\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]) ([\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E])/$1$2/g; + } + $norm_s =~ s/([\x21-\x7E]) ([\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E])/$1$2/g; + $norm_s =~ s/([\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]) ([\x21-\x7E])/$1$2/g; + $norm_s =~ s/ (\xC2\xA9|\xC2\xB7|\xC3\x97) /$1/g; # copyright sign, middle dot, multiplication sign + } + } + + if (($control =~ /\bzh\b/i) && ($control =~ /\bnorm-char\b/)) { + $norm_s =~ s/\xE6\x96\xBC/\xE4\xBA\x8E/g; # feng1 (first char. of Chin. "lie low", line 1308) + $norm_s =~ s/\xE6\xAD\xA7/\xE5\xB2\x90/g; # qi2 (second char. of Chin. "difference", line 1623) + $norm_s =~ s/\xE8\x82\xB2/\xE6\xAF\x93/g; # yu4 (second char. of Chin. "sports", line 440) + $norm_s =~ s/\xE8\x91\x97/\xE7\x9D\x80/g; # zhao (second char. of Chin. "prominent", line 4) + $norm_s =~ s/\xE9\x81\x87/\xE8\xBF\x82/g; # yu4 (second char. of Chin. "good luck", line 959) + } + + if ($control =~ /\bspurious-punct\b/) { + $norm_s =~ s/^\s*[-_\." ]+//; + $norm_s =~ s/[-_\." ]+\s*$//; + $norm_s =~ s/\(\s+end\s+\)\s*$//i; + $norm_s =~ s/^\s*null\s*$//i; + } + + $norm_s =~ s/^\s+//; + $norm_s =~ s/\s+$//; + $norm_s =~ s/\s+/ /g; + + return $norm_s; +} + +sub normalize_extreme_string { + local($this, $s, $control) = @_; + + $control = "" unless defined($control); + + $norm_s = $s; + $norm_s =~ s/\xE2\xA9\xBE/\xE2\x89\xA5/g; # slanted greater than or equal to + + return $norm_s; +} + +sub increase_ht_count { + local($this, *ht, $incr, @path) = @_; + + if ($#path == 0) { + $ht{($path[0])} = ($ht{($path[0])} || 0) + $incr; + } elsif ($#path == 1) { + $ht{($path[0])}->{($path[1])} + = ($ht{($path[0])}->{($path[1])} || 0) + $incr; + } elsif ($#path == 2) { + $ht{($path[0])}->{($path[1])}->{($path[2])} + = ($ht{($path[0])}->{($path[1])}->{($path[2])} || 0) + $incr; + } elsif ($#path == 3) { + $ht{($path[0])}->{($path[1])}->{($path[2])}->{($path[3])} + = ($ht{($path[0])}->{($path[1])}->{($path[2])}->{($path[3])} || 0) + $incr; + } elsif ($#path == 4) { + $ht{($path[0])}->{($path[1])}->{($path[2])}->{($path[3])}->{($path[4])} + = ($ht{($path[0])}->{($path[1])}->{($path[2])}->{($path[3])}->{($path[4])} || 0) + $incr; + } else { + print STDERR "increase_ht_count unsupported for path of length " . ($#path + 1) . "\n"; + } +} + +sub adjust_numbers { + # non-negative integers + local($this, $s, $delta) = @_; + + $result = ""; + while ($s =~ /\d/) { + ($pre,$i,$post) = ($s =~ /^([^0-9]*)(\d+)([^0-9].*|)$/); + $result .= $pre . ($i + $delta); + $s = $post; + } + $result .= $s; + return $result; +} + +sub first_defined { + local($this, @list) = @_; + + foreach $elem (@list) { + return $elem if defined($elem); + } + return ""; +} + +sub first_defined_non_empty { + local($this, @list) = @_; + + foreach $item (@list) { + return $item if defined($item) && ($item ne ""); + } + return ""; +} + +sub elem_after_member_list { + local($this,$elem,@array) = @_; + + my @elem_after_member_list = (); + foreach $i ((0 .. ($#array - 1))) { + push(@elem_after_member_list, $array[$i+1]) if $elem eq $array[$i]; + } + return join(" ", @elem_after_member_list); +} + +sub add_value_to_list { + local($this,$s,$value,$sep) = @_; + + $s = "" unless defined($s); + $sep = "," unless defined($sep); + return ($s =~ /\S/) ? "$s$sep$value" : $value; +} + +sub add_new_value_to_list { + local($this,$s,$value,$sep) = @_; + + $s = "" unless defined($s); + $sep = "," unless defined($sep); + my @values = split(/$sep/, $s); + push(@values, $value) if defined($value) && ! $this->member($value, @values); + + return join($sep, @values); +} + +sub add_new_hash_value_to_list { + local($this,*ht,$key,$value,$sep) = @_; + + $sep = "," unless defined($sep); + my $value_s = $ht{$key}; + if (defined($value_s)) { + my @values = split(/$sep/, $value_s); + push(@values, $value) unless $this->member($value, @values); + $ht{$key} = join($sep, @values); + } else { + $ht{$key} = $value; + } +} + +sub ip_info { + local($this, $ip_address) = @_; + + my %ip_map = (); + $ip_map{"128.9.208.69"} = "Ulf Hermjakob (bach.isi.edu)"; + $ip_map{"128.9.208.169"} = "Ulf Hermjakob (brahms.isi.edu)"; + $ip_map{"128.9.184.148"} = "Ulf Hermjakob (beethoven.isi.edu ?)"; + $ip_map{"128.9.184.162"} = "Ulf Hermjakob (beethoven.isi.edu)"; + $ip_map{"128.9.176.39"} = "Kevin Knight"; + $ip_map{"128.9.184.187"} = "Kevin Knight"; + $ip_map{"128.9.216.56"} = "Kevin Knight"; + $ip_map{"128.9.208.155"} = "cage.isi.edu"; + + return ($ip_name = $ip_map{$ip_address}) ? "$ip_address - $ip_name" : $ip_address; +} + +# from standalone de-accent.pl +sub de_accent_string { + local($this, $s) = @_; + + $s =~ tr/A-Z/a-z/; + unless (0) { + # Latin-1 + if ($s =~ /\xC3[\x80-\xBF]/) { + $s =~ s/(À|Á|Â|Ã|Ä|Å)/A/g; + $s =~ s/Æ/Ae/g; + $s =~ s/Ç/C/g; + $s =~ s/Ð/D/g; + $s =~ s/(È|É|Ê|Ë)/E/g; + $s =~ s/(Ì|Í|Î|Ï)/I/g; + $s =~ s/Ñ/N/g; + $s =~ s/(Ò|Ó|Ô|Õ|Ö|Ø)/O/g; + $s =~ s/(Ù|Ú|Û|Ü)/U/g; + $s =~ s/Þ/Th/g; + $s =~ s/Ý/Y/g; + $s =~ s/(à|á|â|ã|ä|å)/a/g; + $s =~ s/æ/ae/g; + $s =~ s/ç/c/g; + $s =~ s/(è|é|ê|ë)/e/g; + $s =~ s/(ì|í|î|ï)/i/g; + $s =~ s/ð/d/g; + $s =~ s/ñ/n/g; + $s =~ s/(ò|ó|ô|õ|ö)/o/g; + $s =~ s/ß/ss/g; + $s =~ s/þ/th/g; + $s =~ s/(ù|ú|û|ü)/u/g; + $s =~ s/(ý|ÿ)/y/g; + } + # Latin Extended-A + if ($s =~ /[\xC4-\xC5][\x80-\xBF]/) { + $s =~ s/(Ā|Ă|Ą)/A/g; + $s =~ s/(ā|ă|ą)/a/g; + $s =~ s/(Ć|Ĉ|Ċ|Č)/C/g; + $s =~ s/(ć|ĉ|ċ|č)/c/g; + $s =~ s/(Ď|Đ)/D/g; + $s =~ s/(ď|đ)/d/g; + $s =~ s/(Ē|Ĕ|Ė|Ę|Ě)/E/g; + $s =~ s/(ē|ĕ|ė|ę|ě)/e/g; + $s =~ s/(Ĝ|Ğ|Ġ|Ģ)/G/g; + $s =~ s/(ĝ|ğ|ġ|ģ)/g/g; + $s =~ s/(Ĥ|Ħ)/H/g; + $s =~ s/(ĥ|ħ)/h/g; + $s =~ s/(Ĩ|Ī|Ĭ|Į|İ)/I/g; + $s =~ s/(ĩ|ī|ĭ|į|ı)/i/g; + $s =~ s/IJ/Ij/g; + $s =~ s/ij/ij/g; + $s =~ s/Ĵ/J/g; + $s =~ s/ĵ/j/g; + $s =~ s/Ķ/K/g; + $s =~ s/(ķ|ĸ)/k/g; + $s =~ s/(Ĺ|Ļ|Ľ|Ŀ|Ł)/L/g; + $s =~ s/(ļ|ľ|ŀ|ł)/l/g; + $s =~ s/(Ń|Ņ|Ň|Ŋ)/N/g; + $s =~ s/(ń|ņ|ň|ʼn|ŋ)/n/g; + $s =~ s/(Ō|Ŏ|Ő)/O/g; + $s =~ s/(ō|ŏ|ő)/o/g; + $s =~ s/Œ/Oe/g; + $s =~ s/œ/oe/g; + $s =~ s/(Ŕ|Ŗ|Ř)/R/g; + $s =~ s/(ŕ|ŗ|ř)/r/g; + $s =~ s/(Ś|Ŝ|Ş|Š)/S/g; + $s =~ s/(ś|ŝ|ş|š|ſ)/s/g; + $s =~ s/(Ţ|Ť|Ŧ)/T/g; + $s =~ s/(ţ|ť|ŧ)/t/g; + $s =~ s/(Ũ|Ū|Ŭ|Ů|Ű|Ų)/U/g; + $s =~ s/(ũ|ū|ŭ|ů|ű|ų)/u/g; + $s =~ s/Ŵ/W/g; + $s =~ s/ŵ/w/g; + $s =~ s/(Ŷ|Ÿ)/Y/g; + $s =~ s/ŷ/y/g; + $s =~ s/(Ź|Ż|Ž)/Z/g; + $s =~ s/(ź|ż|ž)/z/g; + } + # Latin Extended-B + if ($s =~ /[\xC7-\xC7][\x80-\xBF]/) { + $s =~ s/(\xC7\x8D)/A/g; + $s =~ s/(\xC7\x8E)/a/g; + $s =~ s/(\xC7\x8F)/I/g; + $s =~ s/(\xC7\x90)/i/g; + $s =~ s/(\xC7\x91)/O/g; + $s =~ s/(\xC7\x92)/o/g; + $s =~ s/(\xC7\x93)/U/g; + $s =~ s/(\xC7\x94)/u/g; + $s =~ s/(\xC7\x95)/U/g; + $s =~ s/(\xC7\x96)/u/g; + $s =~ s/(\xC7\x97)/U/g; + $s =~ s/(\xC7\x98)/u/g; + $s =~ s/(\xC7\x99)/U/g; + $s =~ s/(\xC7\x9A)/u/g; + $s =~ s/(\xC7\x9B)/U/g; + $s =~ s/(\xC7\x9C)/u/g; + } + # Latin Extended Additional + if ($s =~ /\xE1[\xB8-\xBF][\x80-\xBF]/) { + $s =~ s/(ḁ|ạ|ả|ấ|ầ|ẩ|ẫ|ậ|ắ|ằ|ẳ|ẵ|ặ|ẚ)/a/g; + $s =~ s/(ḃ|ḅ|ḇ)/b/g; + $s =~ s/(ḉ)/c/g; + $s =~ s/(ḋ|ḍ|ḏ|ḑ|ḓ)/d/g; + $s =~ s/(ḕ|ḗ|ḙ|ḛ|ḝ|ẹ|ẻ|ẽ|ế|ề|ể|ễ|ệ)/e/g; + $s =~ s/(ḟ)/f/g; + $s =~ s/(ḡ)/g/g; + $s =~ s/(ḣ|ḥ|ḧ|ḩ|ḫ)/h/g; + $s =~ s/(ḭ|ḯ|ỉ|ị)/i/g; + $s =~ s/(ḱ|ḳ|ḵ)/k/g; + $s =~ s/(ḷ|ḹ|ḻ|ḽ)/l/g; + $s =~ s/(ḿ|ṁ|ṃ)/m/g; + $s =~ s/(ṅ|ṇ|ṉ|ṋ)/m/g; + $s =~ s/(ọ|ỏ|ố|ồ|ổ|ỗ|ộ|ớ|ờ|ở|ỡ|ợ|ṍ|ṏ|ṑ|ṓ)/o/g; + $s =~ s/(ṕ|ṗ)/p/g; + $s =~ s/(ṙ|ṛ|ṝ|ṟ)/r/g; + $s =~ s/(ṡ|ṣ|ṥ|ṧ|ṩ|ẛ)/s/g; + $s =~ s/(ṫ|ṭ|ṯ|ṱ)/t/g; + $s =~ s/(ṳ|ṵ|ṷ|ṹ|ṻ|ụ|ủ|ứ|ừ|ử|ữ|ự)/u/g; + $s =~ s/(ṽ|ṿ)/v/g; + $s =~ s/(ẁ|ẃ|ẅ|ẇ|ẉ|ẘ)/w/g; + $s =~ s/(ẋ|ẍ)/x/g; + $s =~ s/(ẏ|ỳ|ỵ|ỷ|ỹ|ẙ)/y/g; + $s =~ s/(ẑ|ẓ|ẕ)/z/g; + $s =~ s/(Ḁ|Ạ|Ả|Ấ|Ầ|Ẩ|Ẫ|Ậ|Ắ|Ằ|Ẳ|Ẵ|Ặ)/A/g; + $s =~ s/(Ḃ|Ḅ|Ḇ)/B/g; + $s =~ s/(Ḉ)/C/g; + $s =~ s/(Ḋ|Ḍ|Ḏ|Ḑ|Ḓ)/D/g; + $s =~ s/(Ḕ|Ḗ|Ḙ|Ḛ|Ḝ|Ẹ|Ẻ|Ẽ|Ế|Ề|Ể|Ễ|Ệ)/E/g; + $s =~ s/(Ḟ)/F/g; + $s =~ s/(Ḡ)/G/g; + $s =~ s/(Ḣ|Ḥ|Ḧ|Ḩ|Ḫ)/H/g; + $s =~ s/(Ḭ|Ḯ|Ỉ|Ị)/I/g; + $s =~ s/(Ḱ|Ḳ|Ḵ)/K/g; + $s =~ s/(Ḷ|Ḹ|Ḻ|Ḽ)/L/g; + $s =~ s/(Ḿ|Ṁ|Ṃ)/M/g; + $s =~ s/(Ṅ|Ṇ|Ṉ|Ṋ)/N/g; + $s =~ s/(Ṍ|Ṏ|Ṑ|Ṓ|Ọ|Ỏ|Ố|Ồ|Ổ|Ỗ|Ộ|Ớ|Ờ|Ở|Ỡ|Ợ)/O/g; + $s =~ s/(Ṕ|Ṗ)/P/g; + $s =~ s/(Ṙ|Ṛ|Ṝ|Ṟ)/R/g; + $s =~ s/(Ṡ|Ṣ|Ṥ|Ṧ|Ṩ)/S/g; + $s =~ s/(Ṫ|Ṭ|Ṯ|Ṱ)/T/g; + $s =~ s/(Ṳ|Ṵ|Ṷ|Ṹ|Ṻ|Ụ|Ủ|Ứ|Ừ|Ử|Ữ|Ự)/U/g; + $s =~ s/(Ṽ|Ṿ)/V/g; + $s =~ s/(Ẁ|Ẃ|Ẅ|Ẇ|Ẉ)/W/g; + $s =~ s/(Ẍ)/X/g; + $s =~ s/(Ẏ|Ỳ|Ỵ|Ỷ|Ỹ)/Y/g; + $s =~ s/(Ẑ|Ẓ|Ẕ)/Z/g; + } + # Greek letters + if ($s =~ /\xCE[\x86-\xAB]/) { + $s =~ s/ά/α/g; + $s =~ s/έ/ε/g; + $s =~ s/ί/ι/g; + $s =~ s/ϊ/ι/g; + $s =~ s/ΐ/ι/g; + $s =~ s/ό/ο/g; + $s =~ s/ύ/υ/g; + $s =~ s/ϋ/υ/g; + $s =~ s/ΰ/υ/g; + $s =~ s/ώ/ω/g; + $s =~ s/Ά/Α/g; + $s =~ s/Έ/Ε/g; + $s =~ s/Ή/Η/g; + $s =~ s/Ί/Ι/g; + $s =~ s/Ϊ/Ι/g; + $s =~ s/Ύ/Υ/g; + $s =~ s/Ϋ/Υ/g; + $s =~ s/Ώ/Ω/g; + } + # Cyrillic letters + if ($s =~ /\xD0[\x80-\xAF]/) { + $s =~ s/Ѐ/Е/g; + $s =~ s/Ё/Е/g; + $s =~ s/Ѓ/Г/g; + $s =~ s/Ќ/К/g; + $s =~ s/Ѝ/И/g; + $s =~ s/Й/И/g; + $s =~ s/ѐ/е/g; + $s =~ s/ё/е/g; + $s =~ s/ѓ/г/g; + $s =~ s/ќ/к/g; + $s =~ s/ѝ/и/g; + $s =~ s/й/и/g; + } + } + return $s; +} + +sub read_de_accent_case_resource { + local($this, $filename, *ht, *LOG) = @_; + # e.g. data/char-de-accent-lc.txt + + if (open(IN, $filename)) { + my $mode = "de-accent"; + my $line_number = 0; + my $n_de_accent_targets = 0; + my $n_de_accent_sources = 0; + my $n_case_entries = 0; + while () { + s/\s*$//; + $line_number++; + if ($_ =~ /^#+\s*CASE\b/) { + $mode = "case"; + } elsif ($_ =~ /^#+\s*PUNCTUATION NORMALIZATION\b/) { + $mode = "punctuation-normalization"; + } elsif ($_ =~ /^#/) { + # ignore comment + } elsif ($_ =~ /^\s*$/) { + # ignore empty line + } elsif (($mode eq "de-accent") && (($char_without_accent, @chars_with_accent) = split(/\s+/, $_))) { + if (keys %{$ht{DE_ACCENT_INV}->{$char_without_accent}}) { + print LOG "Ignoring duplicate de-accent line for target $char_without_accent in l.$line_number in $filename\n" unless $char_without_accent eq "--"; + } elsif (@chars_with_accent) { + $n_de_accent_targets++; + foreach $char_with_accent (@chars_with_accent) { + my @prev_target_chars = keys %{$ht{DE_ACCENT}->{$char_with_accent}}; + print LOG "Accent character $char_with_accent has duplicate target $char_without_accent (besides @prev_target_chars) in l.$line_number in $filename\n" if @prev_target_chars && (! ($char_without_accent =~ /^[aou]e$/i)); + $char_without_accent = "" if $char_without_accent eq "--"; + $ht{DE_ACCENT}->{$char_with_accent}->{$char_without_accent} = 1; + $ht{DE_ACCENT_INV}->{$char_without_accent}->{$char_with_accent} = 1; + $n_de_accent_sources++; + } + } else { + print LOG "Empty de-accent list for $char_without_accent in l.$line_number in $filename\n"; + } + } elsif (($mode eq "punctuation-normalization") && (($norm_punct, @unnorm_puncts) = split(/\s+/, $_))) { + if (keys %{$ht{NORM_PUNCT_INV}->{$norm_punct}}) { + print LOG "Ignoring duplicate punctuation-normalization line for target $norm_punct in l.$line_number in $filename\n"; + } elsif (@unnorm_puncts) { + foreach $unnorm_punct (@unnorm_puncts) { + my $prev_norm_punct = $ht{NORM_PUNCT}->{$unnorm_punct}; + if ($prev_norm_punct) { + print LOG "Ignoring duplicate punctuation normalization $unnorm_punct -> $norm_punct (besides $prev_norm_punct) in l.$line_number in $filename\n"; + } + $ht{NORM_PUNCT}->{$unnorm_punct} = $norm_punct; + $ht{NORM_PUNCT_INV}->{$norm_punct}->{$unnorm_punct} = 1; + } + } + } elsif (($mode eq "case") && (($uc_char, $lc_char) = ($_ =~ /^(\S+)\s+(\S+)\s*$/))) { + $ht{UPPER_TO_LOWER_CASE}->{$uc_char} = $lc_char; + $n_case_entries++; + } else { + print LOG "Unrecognized l.$line_number in $filename\n"; + } + } + close(IN); + print LOG "Found $n_case_entries case entries, $n_de_accent_sources/$n_de_accent_targets source/target entries in $line_number lines in file $filename\n"; + } else { + print LOG "Can't open $filename\n"; + } +} + +sub de_accent_char { + local($this, $char, *ht, $default) = @_; + + @de_accend_char_results = sort keys %{$ht{DE_ACCENT}->{$char}}; + return (@de_accend_char_results) ? @de_accend_char_results : ($default); +} + +sub lower_case_char { + local($this, $char, *ht, $default) = @_; + + return $ht{UPPER_TO_LOWER_CASE}->{$char} || $default; +} + +sub lower_case_and_de_accent_char { + local($this, $char, *ht) = @_; + + my $lc_char = $this->lower_case_char($char, *ht, $char); + return $this->de_accent_char($lc_char, *ht, $lc_char); +} + +sub lower_case_and_de_accent_string { + local($this, $string, *ht, $control) = @_; + + my $norm_punct_p = ($control && ($control =~ /norm-punct/i)); + my @chars = $utf8->split_into_utf8_characters($string, "return only chars; XML chars; return trailing whitespaces", *ht); + my $result = ""; + foreach $char (@chars) { + my @lc_de_accented_chars = $this->lower_case_and_de_accent_char($char, *ht); + if ($norm_punct_p + && (! @lc_de_accented_chars)) { + my $norm_punct = $ht{NORM_PUNCT}->{$char}; + @lc_de_accented_chars = ($norm_punct) if $norm_punct; + } + $result .= ((@lc_de_accented_chars) ? $lc_de_accented_chars[0] : $char); + } + return $result; +} + +sub round_to_n_decimal_places { + local($this, $x, $n, $fill_decimals_p) = @_; + + $fill_decimals_p = 0 unless defined($fill_decimals_p); + return $x unless defined($x); + return $x if ($x =~ /^-?\d+$/) && (! $fill_decimals_p); + $factor = 1; + foreach $i ((1 .. $n)) { + $factor *= 10; + } + my $rounded_number; + if ($x > 0) { + $rounded_number = (int(($factor * $x) + 0.5) / $factor); + } else { + $rounded_number = (int(($factor * $x) - 0.5) / $factor); + } + if ($fill_decimals_p) { + ($period, $decimals) = ($rounded_number =~ /^-?\d+(\.?)(\d*)$/); + $rounded_number .= "." unless $period || ($n == 0); + foreach ((1 .. ($n - length($decimals)))) { + $rounded_number .= 0; + } + } + return $rounded_number; +} + +sub commify { + local($caller,$number) = @_; + + my $text = reverse $number; + $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $text; +} + +sub add_javascript_functions { + local($caller,@function_names) = @_; + + $add_javascript_function_s = ""; + foreach $function_name (@function_names) { + + if ($function_name eq "highlight_elems") { + $add_javascript_function_s .= " + function highlight_elems(group_id, value) { + if (group_id != '') { + i = 1; + id = group_id + '-' + i; + while ((s = document.getElementById(id)) != null) { + if (! s.origColor) { + if (s.style.color) { + s.origColor = s.style.color; + } else { + s.origColor = '#000000'; + } + } + if (value == '1') { + s.style.color = '#0000FF'; + if (s.innerHTML == '-') { + s.style.innerHtml = s.innerHTML; + s.innerHTML = '-   ← here'; + s.style.fontWeight = 900; + } else { + s.style.fontWeight = 'bold'; + } + } else { + s.style.fontWeight = 'normal'; + s.style.color = s.origColor; + if (s.style.innerHtml != null) { + s.innerHTML = s.style.innerHtml; + } + } + i = i + 1; + id = group_id + '-' + i; + } + } + } +"; + } elsif ($function_name eq "set_style_for_ids") { + $add_javascript_function_s .= " + function set_style_for_ids(style,id_list) { + var ids = id_list.split(/\\s+/); + var len = ids.length; + var s; + for (var i=0; i\\n postedit\\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write('\\n'); + tmp.close(); + } + } +"; + } elsif ($function_name eq "get_logsid") { + $add_javascript_function_s .= " + function get_logsid () { + var cookies = document.cookie.split(\";\"); + for (var i=0;i\\n'); + tmp.write('

Update AMR Workset Status

\\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
Workset:
Annotator:
New status:
Comment:
(optional)
Notify supervisor by email.
\\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write('\\n'); + tmp.close(); + } +"; + } elsif ($function_name eq "open_amr_issue") { + $add_javascript_function_s .= " + function open_amr_issue (annotator, snt_id, username, button_tr_id) { + if ((s = document.getElementById(button_tr_id)) != null) { + s.style.backgroundColor = \"#7799FF\"; + } + var logsid = get_logsid(); + if ((logsid == '') && ((s2 = document.getElementById('logsid')) != null)) { + logsid = s2.value; + } + newwindow= window.open('','Open issue','height=280,width=700,resizable=1,scrollbars=1,toolbar=1,statusbar=1,menubar=1'); + newwindow.focus(); + var tmp = newwindow.document; + tmp.write('\\n Open issue\\n'); + tmp.write(' \\n'); + tmp.write('

Open AMR Issue

\\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
Open an issue regarding the AMR for snt. ' + snt_id + ' as annotated by ' + annotator + '
Type of issue:
Description of issue:<\/span>
Comment:<\/span>
(optional)
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write('\\n'); + tmp.close(); + } +"; + } elsif ($function_name eq "close_amr_issue") { + $add_javascript_function_s .= " + function close_amr_issue (annotator, snt_id, issue_id, username) { + var logsid = get_logsid(); + if ((logsid == '') && ((s2 = document.getElementById('logsid')) != null)) { + logsid = s2.value; + } + newwindow= window.open('','Close AMR issue','height=300,width=450,resizable=1,scrollbars=1,toolbar=1,statusbar=1,menubar=1'); + newwindow.focus(); + var tmp = newwindow.document; + tmp.write('\\n Close AMR issue\\n'); + tmp.write(' \\n'); + tmp.write('

Close AMR Issue

\\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' Closing AMR issue ...\\n'); + tmp.write(' \\n'); + tmp.write('\\n'); + tmp.close(); + if ((s3 = document.getElementById(('close-' + issue_id))) != null) { + s3.innerHTML = 'closed'; + } + } +"; + } elsif ($function_name eq "check_amr") { + $add_javascript_function_s .= " + function amr_check (annotator, workset, source) { + var logsid = get_logsid(); + if ((logsid == '') && ((s2 = document.getElementById('logsid')) != null)) { + logsid = s2.value; + } + var action = 'https://www.isi.edu/cgi-bin/div3/mt/amr-editor/check-amr.cgi'; + var newwindow_title = 'check ' + annotator + ' ' + workset; + var newwindow=window.open('',newwindow_title); + var tmp = newwindow.document; + tmp.write('\\n AMR checker\\n'); + tmp.write(' \\n'); + tmp.write('Checking AMRs ...\\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write(' \\n'); + tmp.write('
\\n'); + tmp.write(' \\n'); + tmp.write('\\n'); + tmp.close(); + newwindow.focus(); + } +"; + # END EXCLUDE FROM RELEASE UROMAN + } + } + return $add_javascript_function_s; +} + +sub append_to_file { + local($caller, $filename, $s, $mod) = @_; + + my $result = ""; + if (-e $filename) { + if (open(OUT, ">>$filename")) { + print OUT $s; + close(OUT); + $result = "Appended"; + } else { + $result = "Can't append"; + } + } else { + if (open(OUT, ">$filename")) { + print OUT $s; + close(OUT); + $result = "Wrote"; + } else { + $result = "Can't write"; + } + } + chmod($mod, $filename) if defined($mod) && -e $filename; + return $result; +} + +sub square { + local($caller, $x) = @_; + + return $x * $x; +} + +sub mutual_info { + local($caller, $ab_count, $a_count, $b_count, $total_count, $smoothing) = @_; + + $smoothing = 1 unless defined($smoothing); + $ab_count = 0 unless defined($ab_count); + return 0 unless $a_count && $b_count && $total_count; + + my $p_ab = $ab_count / $total_count; + my $p_a = $a_count / $total_count; + my $p_b = $b_count / $total_count; + my $expected_ab = $p_a * $p_b * $total_count; + + return -99 unless $expected_ab || $smoothing; + + return CORE::log(($ab_count + $smoothing) / ($expected_ab + $smoothing)); +} + +sub mutual_info_multi { + local($caller, $multi_count, $total_count, $smoothing, @counts) = @_; + + return 0 unless $total_count; + my $p_indivuals = 1; + foreach $count (@counts) { + return 0 unless $count; + $p_indivuals *= ($count / $total_count); + } + my $expected_multi_count = $p_indivuals * $total_count; + # print STDERR "actual vs. expected multi_count($multi_count, $total_count, $smoothing, @counts) = $multi_count vs. $expected_multi_count\n"; + + return -99 unless $expected_multi_count || $smoothing; + + return CORE::log(($multi_count + $smoothing) / ($expected_multi_count + $smoothing)); +} + +sub precision_recall_fmeasure { + local($caller, $n_gold, $n_test, $n_shared, $pretty_print_p) = @_; + + unless (($n_gold =~ /^[1-9]\d*$/) && ($n_test =~ /^[1-9]\d*$/)) { + $zero = ($pretty_print_p) ? "0%" : 0; + if ($n_gold =~ /^[1-9]\d*$/) { + return ("n/a", $zero, $zero); + } elsif ($n_test =~ /^[1-9]\d*$/) { + return ($zero, "n/a", $zero); + } else { + return ("n/a", "n/a", "n/a"); + } + } + my $precision = $n_shared / $n_test; + my $recall = $n_shared / $n_gold; + my $f_measure = ($precision * $recall * 2) / ($precision + $recall); + + return ($precision, $recall, $f_measure) unless $pretty_print_p; + + my $pretty_precision = $caller->round_to_n_decimal_places(100*$precision, 1) . "%"; + my $pretty_recall = $caller->round_to_n_decimal_places(100*$recall, 1) . "%"; + my $pretty_f_measure = $caller->round_to_n_decimal_places(100*$f_measure, 1) . "%"; + + return ($pretty_precision, $pretty_recall, $pretty_f_measure); +} + +sub recapitalize_named_entity { + local($caller, $s) = @_; + + my @comps = (); + foreach $comp (split(/\s+/, $s)) { + if ($comp =~ /^(and|da|for|of|on|the|van|von)$/) { + push(@comps, $comp); + } elsif ($comp =~ /^[a-z]/) { + push(@comps, ucfirst $comp); + } else { + push(@comps, $comp); + } + } + return join(" ", @comps); +} + +sub slot_value_in_double_colon_del_list { + local($this, $s, $slot, $default) = @_; + + $default = "" unless defined($default); + if (($value) = ($s =~ /::$slot\s+(\S.*\S|\S)\s*$/)) { + $value =~ s/\s*::\S.*\s*$//; + return $value; + } else { + return $default; + } +} + +sub synt_in_double_colon_del_list { + local($this, $s) = @_; + + ($value) = ($s =~ /::synt\s+(\S+|\S.*?\S)(?:\s+::.*)?$/); + return (defined($value)) ? $value : ""; +} + +sub form_in_double_colon_del_list { + local($this, $s) = @_; + + ($value) = ($s =~ /::form\s+(\S+|\S.*?\S)(?:\s+::.*)?$/); + return (defined($value)) ? $value : ""; +} + +sub lex_in_double_colon_del_list { + local($this, $s) = @_; + + ($value) = ($s =~ /::lex\s+(\S+|\S.*?\S)(?:\s+::.*)?$/); + return (defined($value)) ? $value : ""; +} + +sub multi_slot_value_in_double_colon_del_list { + # e.g. when there are multiple slot/value pairs in a line, e.g. ::eng ... :eng ... + local($this, $s, $slot) = @_; + + @values = (); + while (($value, $rest) = ($s =~ /::$slot\s+(\S.*?\S|\S)(\s+::\S.*|\s*)$/)) { + push(@values, $value); + $s = $rest; + } + return @values; +} + +sub remove_slot_in_double_colon_del_list { + local($this, $s, $slot) = @_; + + $s =~ s/::$slot(?:|\s+\S|\s+\S.*?\S)(\s+::\S.*|\s*)$/$1/; + $s =~ s/^\s*//; + return $s; +} + +sub extract_split_info_from_split_dir { + local($this, $dir, *ht) = @_; + + my $n_files = 0; + my $n_snt_ids = 0; + if (opendir(DIR, $dir)) { + my @filenames = sort readdir(DIR); + closedir(DIR); + foreach $filename (@filenames) { + next unless $filename =~ /\.txt$/; + my $split_class; + if (($split_class) = ($filename =~ /-(dev|training|test)-/)) { + my $full_filename = "$dir/$filename"; + if (open(IN, $full_filename)) { + my $old_n_snt_ids = $n_snt_ids; + while () { + if (($snt_id) = ($_ =~ /^#\s*::id\s+(\S+)/)) { + if ($old_split_class = $ht{SPLIT_CLASS}->{$snt_id}) { + unless ($old_split_class eq $split_class) { + print STDERR "Conflicting split class for $snt_id: $old_split_class $split_class\n"; + } + } else { + $ht{SPLIT_CLASS}->{$snt_id} = $split_class; + $ht{SPLIT_CLASS_COUNT}->{$split_class} = ($ht{SPLIT_CLASS_COUNT}->{$split_class} || 0) + 1; + $n_snt_ids++; + } + } + } + $n_files++ unless $n_snt_ids == $old_n_snt_ids; + close(IN); + } else { + print STDERR "Can't open file $full_filename"; + } + } else { + print STDERR "Skipping file $filename when extracting split info from $dir\n"; + } + } + print STDERR "Extracted $n_snt_ids split classes from $n_files files.\n"; + } else { + print STDERR "Can't open directory $dir to extract split info.\n"; + } +} + +sub extract_toks_for_split_class_from_dir { + local($this, $dir, *ht, $split_class, $control) = @_; + + $control = "" unless defined($control); + $print_snt_id_p = ($control =~ /\bwith-snt-id\b/); + my $n_files = 0; + my $n_snts = 0; + if (opendir(DIR, $dir)) { + my @filenames = sort readdir(DIR); + closedir(DIR); + foreach $filename (@filenames) { + next unless $filename =~ /^alignment-release-.*\.txt$/; + my $full_filename = "$dir/$filename"; + if (open(IN, $full_filename)) { + my $old_n_snts = $n_snts; + my $snt_id = ""; + while () { + if (($s_value) = ($_ =~ /^#\s*::id\s+(\S+)/)) { + $snt_id = $s_value; + $proper_split_class_p + = ($this_split_class = $ht{SPLIT_CLASS}->{$snt_id}) + && ($this_split_class eq $split_class); + } elsif (($tok) = ($_ =~ /^#\s*::tok\s+(\S|\S.*\S)\s*$/)) { + if ($proper_split_class_p) { + print "$snt_id " if $print_snt_id_p; + print "$tok\n"; + $n_snts++; + } + } + } + $n_files++ unless $n_snts == $old_n_snts; + close(IN); + } else { + print STDERR "Can't open file $full_filename"; + } + } + print STDERR "Extracted $n_snts tokenized sentences ($split_class) from $n_files files.\n"; + } else { + print STDERR "Can't open directory $dir to extract tokens.\n"; + } +} + +sub load_relevant_tok_ngram_corpus { + local($this, $filename, *ht, $max_lex_rule_span, $ngram_count_min, $optional_ngram_output_filename) = @_; + + $ngram_count_min = 1 unless $ngram_count_min; + $max_lex_rule_span = 10 unless $max_lex_rule_span; + my $n_ngram_instances = 0; + my $n_ngram_types = 0; + if (open(IN, $filename)) { + while () { + s/\s*$//; + @tokens = split(/\s+/, $_); + foreach $from_token_index ((0 .. $#tokens)) { + foreach $to_token_index (($from_token_index .. ($from_token_index + $max_lex_rule_span -1))) { + last if $to_token_index > $#tokens; + my $ngram = join(" ", @tokens[$from_token_index .. $to_token_index]); + $ht{RELEVANT_NGRAM}->{$ngram} = ($ht{RELEVANT_NGRAM}->{$ngram} || 0) + 1; + } + } + } + close(IN); + if ($optional_ngram_output_filename && open(OUT, ">$optional_ngram_output_filename")) { + foreach $ngram (sort keys %{$ht{RELEVANT_NGRAM}}) { + $count = $ht{RELEVANT_NGRAM}->{$ngram}; + next unless $count >= $ngram_count_min; + print OUT "($count) $ngram\n"; + $n_ngram_types++; + $n_ngram_instances += $count; + } + close(OUT); + print STDERR "Extracted $n_ngram_types ngram types, $n_ngram_instances ngram instances.\n"; + print STDERR "Wrote ngram stats to $optional_ngram_output_filename\n"; + } + } else { + print STDERR "Can't open relevant tok ngram corpus $filename\n"; + } +} + +sub load_relevant_tok_ngrams { + local($this, $filename, *ht) = @_; + + my $n_entries = 0; + if (open(IN, $filename)) { + while () { + s/\s*$//; + if (($count, $ngram) = ($_ =~ /^\((\d+)\)\s+(\S|\S.*\S)\s*$/)) { + $lc_ngram = lc $ngram; + $ht{RELEVANT_NGRAM}->{$lc_ngram} = ($ht{RELEVANT_NGRAM}->{$lc_ngram} || 0) + $count; + $ht{RELEVANT_LC_NGRAM}->{$lc_ngram} = ($ht{RELEVANT_LC_NGRAM}->{$lc_ngram} || 0) + $count; + $n_entries++; + } + } + close(IN); + print STDERR "Read in $n_entries entries from $filename\n"; + } else { + print STDERR "Can't open relevant tok ngrams from $filename\n"; + } +} + +sub snt_id_sort_function { + local($this, $a, $b) = @_; + + if ((($core_a, $index_a) = ($a =~ /^(\S+)\.(\d+)$/)) + && (($core_b, $index_b) = ($b =~ /^(\S+)\.(\d+)$/))) { + return ($core_a cmp $core_b) || ($index_a <=> $index_b); + } else { + return $a cmp $b; + } +} + +sub count_value_sort_function { + local($this, $a_count, $b_count, $a_value, $b_value, $control) = @_; + + # normalize fractions such as "1/2" + if ($a_count > $b_count) { + return ($control eq "decreasing") ? -1 : 1; + } elsif ($b_count > $a_count) { + return ($control eq "decreasing") ? 1 : -1; + } + $a_value = $num / $den if ($num, $den) = ($a_value =~ /^([1-9]\d*)\/([1-9]\d*)$/); + $b_value = $num / $den if ($num, $den) = ($b_value =~ /^([1-9]\d*)\/([1-9]\d*)$/); + $a_value =~ s/:/\./ if $a_value =~ /^\d+:\d+$/; + $b_value =~ s/:/\./ if $b_value =~ /^\d+:\d+$/; + if (($a_value =~ /^-?\d+(\.\d+)?$/) + && ($b_value =~ /^-?\d+(\.\d+)?$/)) { + return $a_value <=> $b_value; + } elsif ($a_value =~ /^-?\d+(\.\d+)?$/) { + return 1; + } elsif ($b_value =~ /^-?\d+(\.\d+)?$/) { + return -1; + } else { + return $a_value cmp $b_value; + } +} + +sub undef_to_blank { + local($this, $x) = @_; + + return (defined($x)) ? $x : ""; +} + +sub en_lex_amr_list { + local($this, $s) = @_; + + $bpe = qr{ \( (?: (?> [^()]+ ) | (??{ $bpe }))* \) }x; # see Perl Cookbook 2nd ed. p. 218 + @en_lex_amr_list = (); + my $amr_s; + my $lex; + my $test; + while ($s =~ /\S/) { + $s =~ s/^\s*//; + if (($s =~ /^\([a-z]\d* .*\)/) + && (($amr_s, $rest) = ($s =~ /^($bpe)(\s.*|)$/))) { + push(@en_lex_amr_list, $amr_s); + $s = $rest; + } elsif (($lex, $rest) = ($s =~ /^\s*(\S+)(\s.*|)$/)) { + push(@en_lex_amr_list, $lex); + $s = $rest; + } else { + print STDERR "en_lex_amr_list can't process: $s\n"; + $s = ""; + } + } + return @en_lex_amr_list; +} + +sub make_sure_dir_exists { + local($this, $dir, $umask) = @_; + + mkdir($dir, $umask) unless -d $dir; + chmod($umask, $dir); +} + +sub pretty_percentage { + local($this, $numerator, $denominator) = @_; + + return ($denominator == 0) ? "n/a" : ($this->round_to_n_decimal_places(100*$numerator/$denominator, 2) . "%"); +} + +sub html_color_nth_line { + local($this, $s, $n, $color, $delimiter) = @_; + + $delimiter = "
" unless defined($delimiter); + @lines = split($delimiter, $s); + $lines[$n] = "" . $lines[$n] . "" if ($n =~ /^\d+$/) && ($n <= $#lines); + return join($delimiter, @lines); +} + +sub likely_valid_url_format { + local($this, $url) = @_; + + $url = lc $url; + return 0 if $url =~ /\s/; + return 0 if $url =~ /[@]/; + return 1 if $url =~ /^https?:\/\/.+\.[a-z]+(\?.+)?$/; + return 1 if $url =~ /[a-z].+\.(com|edu|gov|net|org)$/; + return 0; +} + +# see also EnglMorph->special_token_type +$common_file_suffixes = "aspx?|cgi|docx?|gif|html?|jpeg|jpg|pdf|php|pptx?|txt|xml"; +$common_top_domain_suffixes = "museum|info|cat|com|edu|gov|int|mil|net|org|ar|at|au|be|bg|bi|br|ca|ch|cn|co|cz|de|dk|es|eu|fi|fr|gr|hk|hu|id|ie|il|in|ir|is|it|jp|ke|kr|lu|mg|mx|my|nl|no|nz|ph|pl|pt|ro|rs|ru|rw|se|sg|sk|so|tr|tv|tw|tz|ua|ug|uk|us|za"; + +sub token_is_url_p { + local($this, $token) = @_; + + return 1 if $token =~ /^www(\.[a-z0-9]([-a-z0-9_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF])+)+\.([a-z]{2,2}|$common_top_domain_suffixes)(\/(\.{1,3}|[a-z0-9]([-a-z0-9_%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF])+))*(\/[a-z0-9_][-a-z0-9_]+\.($common_file_suffixes))?$/i; + return 1 if $token =~ /^https?:\/\/([a-z]\.)?([a-z0-9]([-a-z0-9_]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF])+\.)+[a-z]{2,}(\/(\.{1,3}|([-a-z0-9_%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF])+))*(\/[a-z_][-a-z0-9_]+\.($common_file_suffixes))?$/i; + return 1 if $token =~ /^[a-z][-a-z0-9_]+(\.[a-z][-a-z0-9_]+)*\.($common_top_domain_suffixes)(\/[a-z0-9]([-a-z0-9_%]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF])+)*(\/[a-z][-a-z0-9_]+\.($common_file_suffixes))?$/i; + return 0; +} + +sub token_is_email_p { + local($this, $token) = @_; + + return ($token =~ /^[a-z][-a-z0-9_]+(\.[a-z][-a-z0-9_]+)*\@[a-z][-a-z0-9_]+(\.[a-z][-a-z0-9_]+)*\.($common_top_domain_suffixes)$/i); +} + +sub token_is_filename_p { + local($this, $token) = @_; + + return 1 if $token =~ /\.($common_file_suffixes)$/; + return 0; +} + +sub token_is_xml_token_p { + local($this, $token) = @_; + + return ($token =~ /^&(amp|apos|gt|lt|nbsp|quot|&#\d+|&#x[0-9A-F]+);$/i); +} + +sub token_is_handle_p { + local($this, $token) = @_; + + return ($token =~ /^\@[a-z][_a-z0-9]*[a-z0-9]$/i); +} + +sub min { + local($this, @list) = @_; + + my $min = ""; + foreach $item (@list) { + $min = $item if ($item =~ /^-?\d+(?:\.\d*)?$/) && (($min eq "") || ($item < $min)); + } + return $min; +} + +sub max { + local($this, @list) = @_; + + my $max = ""; + foreach $item (@list) { + $max = $item if defined($item) && ($item =~ /^-?\d+(?:\.\d*)?(e[-+]\d+)?$/) && (($max eq "") || ($item > $max)); + } + return $max; +} + +sub split_tok_s_into_tokens { + local($this, $tok_s) = @_; + + @token_list = (); + while (($pre, $link_token, $post) = ($tok_s =~ /^(.*?)\s*(\@?<[^<>]+>\@?)\s*(.*)$/)) { + # generate dummy token for leading blank(s) + if (($tok_s =~ /^\s/) && ($pre eq "") && ($#token_list < 0)) { + push(@token_list, ""); + } else { + push(@token_list, split(/\s+/, $pre)); + } + push(@token_list, $link_token); + $tok_s = $post; + } + push(@token_list, split(/\s+/, $tok_s)); + return @token_list; +} + +sub shuffle { + local($this, @list) = @_; + + @shuffle_list = (); + while (@list) { + $len = $#list + 1; + $rand_position = int(rand($len)); + push(@shuffle_list, $list[$rand_position]); + splice(@list, $rand_position, 1); + } + $s = join(" ", @shuffle_list); + return @shuffle_list; +} + +sub timestamp_to_seconds { + local($this, $timestamp) = @_; + + my $epochtime; + if (($year, $month, $day, $hour, $minute, $second) = ($timestamp =~ /^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)$/)) { + $epochtime = timelocal($second, $minute, $hour, $day, $month-1, $year); + } elsif (($year, $month, $day) = ($timestamp =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/)) { + $epochtime = timelocal(0, 0, 0, $day, $month-1, $year); + } elsif (($year, $month, $day, $hour, $minute, $second, $second_fraction) = ($timestamp =~ /^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)\.(\d+)$/)) { + $epochtime = timelocal($second, $minute, $hour, $day, $month-1, $year) + ($second_fraction / (10 ** length($second_fraction))); + } else { + $epochtime = 0; + } + return $epochtime; +} + +sub timestamp_diff_in_seconds { + local($this, $timestamp1, $timestamp2) = @_; + + my $epochtime1 = $this->timestamp_to_seconds($timestamp1); + my $epochtime2 = $this->timestamp_to_seconds($timestamp2); + return $epochtime2 - $epochtime1; +} + +sub dirhash { + # maps string to hash of length 4 with characters [a-z2-8] (shorter acc. to $len) + local($this, $s, $len) = @_; + + $hash = 9999; + $mega = 2 ** 20; + $mega1 = $mega - 1; + $giga = 2 ** 26; + foreach $c (split //, $s) { + $hash = $hash*33 + ord($c); + $hash = ($hash >> 20) ^ ($hash & $mega1) if $hash >= $giga; + } + while ($hash >= $mega) { + $hash = ($hash >> 20) ^ ($hash & $mega1); + } + $result = ""; + while ($hash) { + $c = $hash & 31; + $result .= CORE::chr($c + (($c >= 26) ? 24 : 97)); + $hash = $hash >> 5; + } + while (length($result) < 4) { + $result .= "8"; + } + return substr($result, 0, $len) if $len; + return $result; +} + +sub full_path_python { + + foreach $bin_path (split(":", "/usr/sbin:/usr/bin:/bin:/usr/local/bin")) { + return $python if -x ($python = "$bin_path/python"); + } + return "python"; +} + +sub string_contains_unbalanced_paras { + local($this, $s) = @_; + + return 0 unless $s =~ /[(){}\[\]]/; + $rest = $s; + while (($pre,$left,$right,$post) = ($rest =~ /^(.*)([({\[]).*?([\]})])(.*)$/)) { + return 1 unless (($left eq "(") && ($right eq ")")) + || (($left eq "[") && ($right eq "]")) + || (($left eq "{") && ($right eq "}")); + $rest = "$pre$post"; + } + return 1 if $rest =~ /[(){}\[\]]/; + return 0; +} + +sub dequote_string { + local($this, $s) = @_; + + if ($s =~ /^".*"$/) { + $s = substr($s, 1, -1); + $s =~ s/\\"/"/g; + return $s; + } elsif ($s =~ /^'.*'$/) { + $s = substr($s, 1, -1); + $s =~ s/\\'/'/g; + return $s; + } else { + return $s; + } +} + +sub defined_non_space { + local($this, $s) = @_; + + return (defined($s) && ($s =~ /\S/)); +} + +sub default_if_undefined { + local($this, $s, $default) = @_; + + return (defined($s) ? $s : $default); +} + +sub remove_empties { + local($this, @list) = @_; + + @filtered_list = (); + foreach $elem (@list) { + push(@filtered_list, $elem) if defined($elem) && (! ($elem =~ /^\s*$/)) && (! $this->member($elem, @filtered_list)); + } + + return @filtered_list; +} + +# copied from AMRexp.pm +sub new_var_for_surf_amr { + local($this, $amr_s, $s) = @_; + + my $letter = ($s =~ /^[a-z]/i) ? lc substr($s, 0, 1) : "x"; + return $letter unless ($amr_s =~ /:\S+\s+\($letter\s+\//) + || ($amr_s =~ /\s\($letter\s+\//) + || ($amr_s =~ /^\s*\($letter\s+\//); # ))) + my $i = 2; + while (($amr_s =~ /:\S+\s+\($letter$i\s+\//) + || ($amr_s =~ /\s+\($letter$i\s+\//) + || ($amr_s =~ /^\s*\($letter$i\s+\//)) { # ))) + $i++; + } + return "$letter$i"; +} + +# copied from AMRexp.pm +sub new_vars_for_surf_amr { + local($this, $amr_s, $ref_amr_s) = @_; + + my $new_amr_s = ""; + my %new_var_ht = (); + my $remaining_amr_s = $amr_s; + my $pre; my $var; my $concept; my $post; + while (($pre, $var, $concept, $post) = ($remaining_amr_s =~ /^(.*?\()([a-z]\d*)\s+\/\s+([^ ()\s]+)(.*)$/s)) { + $new_var = $this->new_var_for_surf_amr("$ref_amr_s $new_amr_s", $concept); + $new_var_ht{$var} = $new_var; + $new_amr_s .= "$pre$new_var / $concept"; + $remaining_amr_s = $post; + } + $new_amr_s .= $remaining_amr_s; + + # also update any reentrancy variables + $remaining_amr_s = $new_amr_s; + $new_amr_s2 = ""; + while (($pre, $var, $post) = ($remaining_amr_s =~ /^(.*?:\S+\s+)([a-z]\d*)([ ()\s].*)$/s)) { + $new_var = $new_var_ht{$var} || $var; + $new_amr_s2 .= "$pre$new_var"; + $remaining_amr_s = $post; + } + $new_amr_s2 .= $remaining_amr_s; + + return $new_amr_s2; +} + +sub update_inner_span_for_id { + local($this, $html_line, $slot, $new_value) = @_; + # e.g. slot: workset-language-name value: Uyghur + + if (defined($new_value) + && (($pre, $old_value, $post) = ($html_line =~ /^(.*]* id="$slot"[^<>]*>)([^<>]*)(<\/span\b[^<>]*>.*)$/i)) + && ($old_value ne $new_value)) { + # print STDERR "Inserting new $slot $old_value -> $new_value\n"; + return $pre . $new_value . $post . "\n"; + } else { + # no change + return $html_line; + } +} + +sub levenshtein_distance { + local($this, $s1, $s2) = @_; + + my $i; + my $j; + my @distance; + my @s1_chars = $utf8->split_into_utf8_characters($s1, "return only chars", *empty_ht); + my $s1_length = $#s1_chars + 1; + my @s2_chars = $utf8->split_into_utf8_characters($s2, "return only chars", *empty_ht); + my $s2_length = $#s2_chars + 1; + for ($i = 0; $i <= $s1_length; $i++) { + $distance[$i][0] = $i; + } + for ($j = 1; $j <= $s2_length; $j++) { + $distance[0][$j] = $j; + } + for ($j = 1; $j <= $s2_length; $j++) { + for ($i = 1; $i <= $s1_length; $i++) { + my $substitution_cost = ($s1_chars[$i-1] eq $s2_chars[$j-1]) ? 0 : 1; + $distance[$i][$j] = $this->min($distance[$i-1][$j] + 1, + $distance[$i][$j-1] + 1, + $distance[$i-1][$j-1] + $substitution_cost); + # print STDERR "SC($i,$j) = $substitution_cost\n"; + # $d = $distance[$i][$j]; + # print STDERR "D($i,$j) = $d\n"; + } + } + return $distance[$s1_length][$s2_length]; +} + +sub markup_parts_of_string_in_common_with_ref { + local($this, $s, $ref, $start_markup, $end_markup, $deletion_markup, $verbose) = @_; + + # \x01 temporary start-markup + # \x02 temporary end-markup + # \x03 temporary deletion-markup + $s =~ s/[\x01-\x03]//g; + $ref =~ s/[\x01-\x03]//g; + my $i; + my $j; + my @distance; + my @s_chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); + my $s_length = $#s_chars + 1; + my @ref_chars = $utf8->split_into_utf8_characters($ref, "return only chars", *empty_ht); + my $ref_length = $#ref_chars + 1; + $distance[0][0] = 0; + $del_ins_subst_op[0][0] = "-"; + for ($i = 1; $i <= $s_length; $i++) { + $distance[$i][0] = $i; + $del_ins_subst_op[$i][0] = 0; + } + for ($j = 1; $j <= $ref_length; $j++) { + $distance[0][$j] = $j; + $del_ins_subst_op[0][$j] = 1; + } + for ($j = 1; $j <= $ref_length; $j++) { + for ($i = 1; $i <= $s_length; $i++) { + my $substitution_cost = (($s_chars[$i-1] eq $ref_chars[$j-1])) ? 0 : 1; + my @del_ins_subst_list = ($distance[$i-1][$j] + 1, + $distance[$i][$j-1] + 1, + $distance[$i-1][$j-1] + $substitution_cost); + my $min = $this->min(@del_ins_subst_list); + my $del_ins_subst_position = $this->position($min, @del_ins_subst_list); + $distance[$i][$j] = $min; + $del_ins_subst_op[$i][$j] = $del_ins_subst_position; + } + } + $d = $distance[$s_length][$ref_length]; + print STDERR "markup_parts_of_string_in_common_with_ref LD($s,$ref) = $d\n" if $verbose; + for ($j = 0; $j <= $ref_length; $j++) { + for ($i = 0; $i <= $s_length; $i++) { + $d = $distance[$i][$j]; + $op = $del_ins_subst_op[$i][$j]; + print STDERR "$d($op) " if $verbose; + } + print STDERR "\n" if $verbose; + } + my $result = ""; + my $i_end = $s_length; + my $j_end = $ref_length; + my $cost = $distance[$i_end][$j_end]; + $i = $i_end; + $j = $j_end; + while (1) { + $result2 = $result; + $result2 =~ s/\x01/$start_markup/g; + $result2 =~ s/\x02/$end_markup/g; + $result2 =~ s/\x03/$deletion_markup/g; + print STDERR "i:$i i-end:$i_end j:$j j-end:$j_end r: $result2\n" if $verbose; + # matching characters + if ($i && $j && ($del_ins_subst_op[$i][$j] == 2) && ($distance[$i-1][$j-1] == $distance[$i][$j])) { + $i--; + $j--; + } else { + # previously matching characters + if (($i < $i_end) && ($j < $j_end)) { + my $sub_s = join("", @s_chars[$i .. $i_end-1]); + $result = "\x01" . $sub_s . "\x02" . $result; + } + # character substitution + if ($i && $j && ($del_ins_subst_op[$i][$j] == 2)) { + $i--; + $j--; + $result = $s_chars[$i] . $result; + } elsif ($i && ($del_ins_subst_op[$i][$j] == 0)) { + $i--; + $result = $s_chars[$i] . $result; + } elsif ($j && ($del_ins_subst_op[$i][$j] == 1)) { + $j--; + $result = "\x03" . $result; + } else { + last; + } + $i_end = $i; + $j_end = $j; + } + } + $result2 = $result; + $result2 =~ s/\x01/$start_markup/g; + $result2 =~ s/\x02/$end_markup/g; + $result2 =~ s/\x03/$deletion_markup/g; + print STDERR "i:$i i-end:$i_end j:$j j-end:$j_end r: $result2 *\n" if $verbose; + $result =~ s/(\x02)\x03+(\x01)/$1$deletion_markup$2/g; + $result =~ s/(\x02)\x03+$/$1$deletion_markup/g; + $result =~ s/^\x03+(\x01)/$deletion_markup$1/g; + $result =~ s/\x03//g; + $result =~ s/\x01/$start_markup/g; + $result =~ s/\x02/$end_markup/g; + return $result; +} + +sub env_https { + my $https = $ENV{'HTTPS'}; + return 1 if $https && ($https eq "on"); + + my $http_via = $ENV{'HTTP_VIA'}; + return 1 if $http_via && ($http_via =~ /\bHTTPS\b.* \d+(?:\.\d+){3,}:443\b/); # tmp for beta.isi.edu + + return 0; +} + +sub env_http_host { + return $ENV{'HTTP_HOST'} || ""; +} + +sub env_script_filename { + return $ENV{'SCRIPT_FILENAME'} || ""; +} + +sub cgi_mt_app_root_dir { + local($this, $target) = @_; + my $s; + if ($target =~ /filename/i) { + $s = $ENV{'SCRIPT_FILENAME'} || ""; + } else { + $s = $ENV{'SCRIPT_NAME'} || ""; + } + return "" unless $s; + return $d if ($d) = ($s =~ /^(.*?\/(?:amr-editor|chinese-room-editor|utools|romanizer\/version\/[-.a-z0-9]+|romanizer))\//); + return $d if ($d) = ($s =~ /^(.*)\/(?:bin|src|scripts?)\/[^\/]*$/); + return $d if ($d) = ($s =~ /^(.*)\/[^\/]*$/); + return ""; +} + +sub parent_dir { + local($this, $dir) = @_; + + $dir =~ s/\/[^\/]+\/?$//; + return $dir || "/"; +} + +sub span_start { + local($this, $span, $default) = @_; + + $default = "" unless defined($default); + return (($start) = ($span =~ /^(\d+)-\d+$/)) ? $start : $default; +} + +sub span_end { + local($this, $span, $default) = @_; + + $default = "" unless defined($default); + return (($end) = ($span =~ /^\d+-(\d+)$/)) ? $end : $default; +} + +sub oct_mode { + local($this, $filename) = @_; + + @stat = stat($filename); + return "" unless @stat; + $mode = $stat[2]; + $oct_mode = sprintf("%04o", $mode & 07777); + return $oct_mode; +} + +sub csv_to_list { + local($this, $s, $control_string) = @_; + # Allow quoted string such as "Wait\, what?" as element with escaped comma inside. + + $control_string = "" unless defined($control_string); + $strip_p = ($control_string =~ /\bstrip\b/); + $allow_simple_commas_in_quote = ($control_string =~ /\bsimple-comma-ok\b/); + $ignore_empty_elem_p = ($control_string =~ /\bno-empty\b/); + @cvs_list = (); + while ($s ne "") { + if ((($elem, $rest) = ($s =~ /^"((?:\\[,\"]|[^,\"][\x80-\xBF]*)*)"(,.*|)$/)) + || ($allow_simple_commas_in_quote + && (($elem, $rest) = ($s =~ /^"((?:\\[,\"]|[^\"][\x80-\xBF]*)*)"(,.*|)$/))) + || (($elem, $rest) = ($s =~ /^([^,]*)(,.*|\s*)$/)) + || (($elem, $rest) = ($s =~ /^(.*)()$/))) { + if ($strip_p) { + $elem =~ s/^\s*//; + $elem =~ s/\s*$//; + } + push(@cvs_list, $elem) unless $ignore_empty_elem_p && ($elem eq ""); + $rest =~ s/^,//; + $s = $rest; + } else { + print STDERR "Error in csv_to_list processing $s\n"; + last; + } + } + return @cvs_list; +} + +sub kl_divergence { + local($this, $distribution_id, $gold_distribution_id, *ht, $smoothing) = @_; + + my $total_count = $ht{DISTRIBUTION_TOTAL_COUNT}->{$distribution_id}; + my $total_gold_count = $ht{DISTRIBUTION_TOTAL_COUNT}->{$gold_distribution_id}; + return unless $total_count && $total_gold_count; + + my @values = keys %{$ht{DISTRIBUTION_VALUE_COUNT}->{$gold_distribution_id}}; + my $n_values = $#values + 1; + + my $min_total_count = $this->min($total_count, $total_gold_count); + $smoothing = 1 - (10000/((100+$min_total_count)**2)) unless defined($smoothing); + return unless $smoothing; + my $smoothed_n_values = $smoothing * $n_values; + my $divergence = 0; + foreach $value (@values) { + my $count = $ht{DISTRIBUTION_VALUE_COUNT}->{$distribution_id}->{$value} || 0; + my $gold_count = $ht{DISTRIBUTION_VALUE_COUNT}->{$gold_distribution_id}->{$value}; + my $p = ($count + $smoothing) / ($total_count + $smoothed_n_values); + my $q = ($gold_count + $smoothing) / ($total_gold_count + $smoothed_n_values); + if ($p == 0) { + # no impact on divergence + } elsif ($q) { + my $incr = $p * CORE::log($p/$q); + $divergence += $incr; + my $incr2 = $this->round_to_n_decimal_places($incr, 5); + my $p2 = $this->round_to_n_decimal_places($p, 5); + my $q2 = $this->round_to_n_decimal_places($q, 5); + $incr2 = "+" . $incr2 if $incr > 0; + $log = " value: $value count: $count gold_count: $gold_count p: $p2 q: $q2 $incr2\n"; + $ht{KL_DIVERGENCE_LOG}->{$distribution_id}->{$gold_distribution_id}->{$value} = $log; + $ht{KL_DIVERGENCE_INCR}->{$distribution_id}->{$gold_distribution_id}->{$value} = $incr; + } else { + $divergence += 999; + } + } + return $divergence; +} + +sub read_ISO_8859_named_entities { + local($this, *ht, $filename, $verbose) = @_; + # e.g. from /nfs/isd/ulf/arabic/data/ISO-8859-1-HTML-named-entities.txt + + my $n = 0; + if (open(IN, $filename)) { + while () { + if (($name, $dec_unicode) = ($_ =~ /^{$name} = $dec_unicode; + $ht{HTML_ENTITY_DECUNICODE_TO_NAME}->{$dec_unicode} = $name; + $n++; + print STDERR "read_ISO_8859_named_entities $name $dec_unicode .\n" if $name =~ /dash/; + } + } + close(IN); + print STDERR "Loaded $n entries from $filename\n" if $verbose; + } else { + print STDERR "Could not open $filename\n" if $verbose; + } +} + +sub neg { + local($this, $x) = @_; + + # robust + return (defined($x) && ($x =~ /^-?\d+(?:\.\d+)?$/)) ? (- $x) : $x; +} + +sub read_ttable_gloss_data { + local($this, $filename, $lang_code, *ht, $direction) = @_; + # e.g. /nfs/isd/ulf/croom/oov-lanpairs/som-eng/som-eng-ttable-glosses.txt + + $direction = "f to e" unless defined($direction); + if (open(IN, $filename)) { + while () { + if (($headword, $gloss) = ($_ =~ /^(.*?)\t(.*?)\s*$/)) { + if ($direction eq "e to f") { + $ht{TTABLE_E_GLOSS}->{$lang_code}->{$headword} = $gloss; + } else { + $ht{TTABLE_F_GLOSS}->{$lang_code}->{$headword} = $gloss; + } + } + } + close(IN); + } +} + +sub format_gloss_for_tooltop { + local($this, $gloss) = @_; + + $gloss =~ s/^\s*/\t/; + $gloss =~ s/\s*$//; + $gloss =~ s/ / /g; + $gloss =~ s/\t/ /g; + return $gloss; +} + +sub obsolete_tooltip { + local($this, $s, $lang_code, *ht) = @_; + + return $gloss if defined($gloss = $ht{TTABLE_F_GLOSS}->{$lang_code}->{$s}); + @e_s = sort { $ht{T_TABLE_F_E_C}->{$lang_code}->{$s}->{$b} + <=> $ht{T_TABLE_F_E_C}->{$lang_code}->{$s}->{$a} } + keys %{$ht{T_TABLE_F_E_C}->{$lang_code}->{$s}}; + if (@e_s) { + $e = shift @e_s; + $count = $ht{T_TABLE_F_E_C}->{$lang_code}->{$s}->{$e}; + $min_count = $this->max($count * 0.01, 1.0); + $count =~ s/(\.\d\d)\d*$/$1/; + $result = "$s: $e ($count)"; + $n = 1; + while (@e_s) { + $e = shift @e_s; + $count = $ht{T_TABLE_F_E_C}->{$lang_code}->{$s}->{$e}; + last if $count < $min_count; + $count =~ s/(\.\d\d)\d*$/$1/; + $result .= " $e ($count)"; + $n++; + last if $n >= 10; + } + $ht{TTABLE_F_GLOSS}->{$lang_code}->{$s} = $result; + return $result; + } else { + return ""; + } +} + +sub markup_html_line_init { + local($this, $s, *ht, $id) = @_; + + my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); + $ht{S}->{$id} = $s; +} + +sub markup_html_line_regex { + local($this, $id, *ht, $regex, $m_slot, $m_value, *LOG) = @_; + + unless ($regex eq "") { + my $s = $ht{S}->{$id}; + my $current_pos = 0; + while (($pre, $match_s, $post) = ($s =~ /^(.*?)($regex)(.*)$/)) { + $current_pos += $utf8->length_in_utf8_chars($pre); + my $match_len = $utf8->length_in_utf8_chars($match_s); + $ht{START}->{$id}->{$current_pos}->{$m_slot}->{$m_value} = 1; + $ht{STOP}->{$id}->{($current_pos+$match_len)}->{$m_slot}->{$m_value} = 1; + $current_pos += $match_len; + $s = $post; + } + } +} + +sub html_markup_line { + local($this, $id, *ht, *LOG) = @_; + + my @titles = (); + my @colors = (); + my @text_decorations = (); + + my $s = $ht{S}->{$id}; + # print LOG "html_markup_line $id: $s\n"; + my @chars = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht); + my $markedup_s = ""; + + my $new_title = ""; + my $new_color = ""; + my $new_text_decoration = ""; + my $n_spans = 0; + my $i; + foreach $i ((0 .. ($#chars+1))) { + my $stop_span_p = 0; + foreach $m_slot (keys %{$ht{STOP}->{$id}->{$i}}) { + foreach $m_value (keys %{$ht{STOP}->{$id}->{$i}->{$m_slot}}) { + if ($m_slot eq "title") { + my $last_positition = $this->last_position($m_value, @titles); + splice(@titles, $last_positition, 1) if $last_positition >= 0; + $stop_span_p = 1; + } elsif ($m_slot eq "color") { + my $last_positition = $this->last_position($m_value, @colors); + splice(@colors, $last_positition, 1) if $last_positition >= 0; + $stop_span_p = 1; + } elsif ($m_slot eq "text-decoration") { + my $last_positition = $this->last_position($m_value, @text_decorations); + splice(@text_decorations, $last_positition, 1) if $last_positition >= 0; + $stop_span_p = 1; + } + } + } + if ($stop_span_p) { + $markedup_s .= ""; + $n_spans--; + } + my $start_span_p = 0; + foreach $m_slot (keys %{$ht{START}->{$id}->{$i}}) { + foreach $m_value (keys %{$ht{START}->{$id}->{$i}->{$m_slot}}) { + if ($m_slot eq "title") { + push(@titles, $m_value); + $start_span_p = 1; + } elsif ($m_slot eq "color") { + push(@colors, $m_value); + $start_span_p = 1; + } elsif ($m_slot eq "text-decoration") { + push(@text_decorations, $m_value); + $start_span_p = 1; + } + } + } + if ($stop_span_p || $start_span_p) { + my $new_title = (@titles) ? $titles[$#titles] : ""; + my $new_color = (@colors) ? $colors[$#colors] : ""; + my $new_text_decoration = (@text_decorations) ? $text_decorations[$#text_decorations] : ""; + if ($new_title || $new_color || $new_text_decoration) { + my $args = ""; + if ($new_title) { + $g_title = $this->guard_html_quote($new_title); + $args .= " title=\"$g_title\""; + } + if ($new_color || $new_text_decoration) { + $g_color = $this->guard_html_quote($new_color); + $g_text_decoration = $this->guard_html_quote($new_text_decoration); + $color_clause = ($new_color) ? "color:$g_color;" : ""; + $text_decoration_clause = ($new_text_decoration) ? "text-decoration:$g_text_decoration;" : ""; + $text_decoration_clause =~ s/text-decoration:(border-bottom:)/$1/g; + $args .= " style=\"$color_clause$text_decoration_clause\""; + } + if ($n_spans) { + $markedup_s .= ""; + $n_spans--; + } + $markedup_s .= ""; + $n_spans++; + } + } + $markedup_s .= $chars[$i] if $i <= $#chars; + } + print LOG "Error in html_markup_line $id final no. of open spans: $n_spans\n" if $n_spans; + return $markedup_s; +} + +sub offset_adjustment { + local($this, $g, $s, $offset, $snt_id, *ht, *LOG, $control) = @_; + # s(tring) e.g. "can't" + # g(old string) e.g. "can not" + # Typically when s is a slight variation of g (e.g. with additional tokenization spaces in s) + # returns mapping 0->0, 1->1, 2->2, 3->3, 6->4, 7->5 + + $control = "" unless defined($control); + my $verbose = ($control =~ /\bverbose\b/); + my $s_offset = 0; + my $g_offset = 0; + my @s_chars = $utf8->split_into_utf8_characters($s, "return only chars", *ht); + my @g_chars = $utf8->split_into_utf8_characters($g, "return only chars", *ht); + my $s_len = $#s_chars + 1; + my $g_len = $#g_chars + 1; + $ht{OFFSET_MAP}->{$snt_id}->{$offset}->{$s_offset} = $g_offset; + $ht{OFFSET_MAP}->{$snt_id}->{$offset}->{($s_offset+$s_len)} = $g_offset+$g_len; + + while (($s_offset < $s_len) && ($g_offset < $g_len)) { + if ($s_chars[$s_offset] eq $g_chars[$g_offset]) { + $s_offset++; + $g_offset++; + $ht{OFFSET_MAP}->{$snt_id}->{$offset}->{$s_offset} = $g_offset; + } else { + my $best_gm = 0; + my $best_sm = 0; + my $best_match_len = 0; + foreach $max_m ((1 .. 4)) { + foreach $sm ((0 .. $max_m)) { + $max_match_len = 0; + while ((($s_index = $s_offset+$sm+$max_match_len) < $s_len) + && (($g_index = $g_offset+$max_m+$max_match_len) < $g_len)) { + if ($s_chars[$s_index] eq $g_chars[$g_index]) { + $max_match_len++; + } else { + last; + } + } + if ($max_match_len > $best_match_len) { + $best_match_len = $max_match_len; + $best_sm = $sm; + $best_gm = $max_m; + } + } + foreach $gm ((0 .. $max_m)) { + $max_match_len = 0; + while ((($s_index = $s_offset+$max_m+$max_match_len) < $s_len) + && (($g_index = $g_offset+$gm+$max_match_len) < $g_len)) { + if ($s_chars[$s_index] eq $g_chars[$g_index]) { + $max_match_len++; + } else { + last; + } + } + if ($max_match_len > $best_match_len) { + $best_match_len = $max_match_len; + $best_sm = $max_m; + $best_gm = $gm; + } + } + } + if ($best_match_len) { + $s_offset += $best_sm; + $g_offset += $best_gm; + $ht{OFFSET_MAP}->{$snt_id}->{$offset}->{$s_offset} = $g_offset; + } else { + last; + } + } + } + if ($verbose) { + foreach $s_offset (sort { $a <=> $b } + keys %{$ht{OFFSET_MAP}->{$snt_id}->{$offset}}) { + my $g_offset = $ht{OFFSET_MAP}->{$snt_id}->{$offset}->{$s_offset}; + print LOG " OFFSET_MAP $snt_id.$offset $s/$g $s_offset -> $g_offset\n"; + } + } +} + +1; diff --git a/v1.3.7/lib/NLP/xml.pm b/v1.3.7/lib/NLP/xml.pm new file mode 100755 index 0000000..bf4bbf0 --- /dev/null +++ b/v1.3.7/lib/NLP/xml.pm @@ -0,0 +1,802 @@ +################################################################ +# # +# xml # +# # +################################################################ + +package NLP::xml; + +use NLP::SntSegmenter; +use NLP::UTF8; +use NLP::utilities; + +$snt_segm = NLP::SntSegmenter; +$utf8 = NLP::UTF8; +$util = NLP::utilities; + +sub read_xml_file { + local($this, $filename, *ht, $xml_id, $schema, $control) = @_; + + $control = "" unless defined($control); + my $s = $util->read_file($filename); + $s =~ s/<\/?a\b[^<>]*\/?>//ig if $control =~ /-a\b/; + $s =~ s/<\/?img\b[^<>]*\/?>//ig if $control =~ /-img\b/; + $this->read_xml($s, *ht, $xml_id, $schema, $control); +} + +sub n_newlines { + local($this, @strings) = @_; + + my $n = 0; + foreach $s (@strings) { + $n += (() = ($s =~ /\n/g)); + } + return $n; +} + +sub level_of_tag_nesting { + local($this, *ht, $xml_id, $node_id, $ref_tag) = @_; + + my $n = 0; + while ($node_id) { + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + $n++ if $tag && ($tag eq $ref_tag); + $node_id = $ht{$xml_id}->{PARENT}->{$node_id}; + } + return $n; +} + +sub read_xml { + local($this, $s, *ht, $xml_id, $schema, $control) = @_; + + my $ping_every_n_lines = 100; + my $next_ping_at = 0; + my $line_number = 1; + my $nesting_level = 0; + my $printed_message_p = 0; + my $parent_node_id = "1"; + $ht{$xml_id}->{N_SUBS}->{$parent_node_id} = 0; + $ht{$xml_id}->{SCHEMA} = $schema; + $control = "" unless defined($control); + my $prev_text = ""; + $s =~ s/(]*))?) xlink:(href=)/$1 $2/g; # xlink:href -> href + # $tmp = $s; + # $tmp =~ s/.*(]*>).*/$1/s; + # print STDERR "EXT-LINK: $tmp\n"; + while ($s ne "") { + if ($line_number >= $next_ping_at) { + print STDERR "."; + $next_ping_at += $ping_every_n_lines; + } + $s =~ s/^\xEF\xBB\xBF//; + # # comment + if (($tag, $rest) = ($s =~ /^(\s*)(.*)$/s)) { + $line_number += $this->n_newlines($tag); + $prev_text = $tag; + # open tag + } elsif (($pre, $tag, $arg_s, $close_s, $rest) = ($s =~ /^(\s*)<([a-zA-Z][-_:a-zA-Z0-9]*)([^<>]*?)(\s*\/|)>(.*)$/s)) { + $line_number += $this->n_newlines($pre); + $tag = lc $tag; + $nesting_level++; + $ht{$xml_id}->{N_SUBTAGS}->{$parent_node_id} = ($ht{$xml_id}->{N_SUBTAGS}->{$parent_node_id} || 0) + 1; + $n_subs = $ht{$xml_id}->{N_SUBS}->{$parent_node_id} || 0; + $n_subs++; + $ht{$xml_id}->{N_SUBS}->{$parent_node_id} = $n_subs; + $node_id = join(".", $parent_node_id,$n_subs); + $ht{$xml_id}->{PARENT}->{$node_id} = $parent_node_id; + $ht{$xml_id}->{N_SUBS}->{$node_id} = 0; + $ht{$xml_id}->{TAG}->{$node_id} = $tag; + # print STDERR " OPEN <$tag> $node_id (l.$line_number)\n"; + $ht{$xml_id}->{START_LINE}->{$node_id} = $line_number; + while (($pre, $slot, $value, $arg_rest) = ($arg_s =~ /^(\s*)([a-zA-Z][-:_a-zA-Z0-9]*)=("[^"]*"|'[^']*')(.*)$/s)) { + if ($value =~ /^".*"$/) { + $value =~ s/^"//; + $value =~ s/"$//; + } elsif ($value =~ /^'.*'$/) { + $value =~ s/^'//; + $value =~ s/'$//; + } + $line_number += $this->n_newlines($pre); + $arg_s = $arg_rest; + $slot = lc $slot; + $ht{$xml_id}->{ARG}->{$node_id}->{$slot} = $value; + $line_number += $this->n_newlines($slot, $value); + } + if ($arg_s =~ /\S/) { + print STDERR " Unprocessed (remaining) tag arg string '$arg_s' in line $line_number in $xml_id\n"; + $printed_message_p = 1; + } + $n_tags = $ht{$xml_id}->{N_TAGS}->{$tag} || 0; + $n_tags++; + $ht{$xml_id}->{N_TAGS}->{$tag} = $n_tags; + $ht{$xml_id}->{NTH_TAG}->{$tag}->{$n_tags} = $node_id; + if ($close_s ne "") { + $ht{$xml_id}->{END_LINE}->{$node_id} = $line_number; + $nesting_level--; + } else { + $parent_node_id = $node_id; + } + $prev_text = "<" . "$tag$arg_s$close_s" . ">"; + # close tag + } elsif (($pre, $tag, $rest) = ($s =~ /^(\s*)<\/([a-zA-Z][-_:a-zA-Z0-9]*)>(.*)$/s)) { + $line_number += $this->n_newlines($pre); + # print STDERR " CLOSE <\/$tag> $parent_node_id (l.$line_number)\n"; + $tag = lc $tag; + $open_tag = $ht{$xml_id}->{TAG}->{$parent_node_id} || "?"; + if ($tag ne $open_tag) { + $open_tag_start = $ht{$xml_id}->{START_LINE}->{$parent_node_id} || "?"; + print STDERR "Ignoring close tag $tag (line $line_number in $xml_id), because it does not match open tag $open_tag (line $open_tag_start)\n"; + $printed_message_p = 1; + # print LOG "Ignoring close tag $tag (line $line_number in $xml_id), because it does not match open tag $open_tag (line $open_tag_start)\n" if defined LOG; + } else { + $parent_node_id = $ht{$xml_id}->{PARENT}->{$parent_node_id} || 0; + $nesting_level--; + } + $prev_text = ""; + # special/bad tag + } elsif (($pre, $tag, $rest) = ($s =~ /^(\s*)(<[^<>]*>?)(.*)$/s)) { + $line_number += $this->n_newlines($pre); + # + if ($tag =~ /^\s*]*>/s) { + $ht{$xml_id}->{DOCTYPE} = $doctype if ($doctype) = ($tag =~ /^/s); + $ht{$xml_id}->{SYSTEM} = $system if ($system) = ($tag =~ / SYSTEM\s+\"([^"]*)\"/s); + # + } elsif ($tag =~ /^\s*<\?xml\s[^<>]*\?>/s) { + $ht{$xml_id}->{XML_VERSION} = $version if ($version) = ($tag =~ /^<\?xml\s+[^<>]*bversion="([^"]+)"[^<>]*\?>$/s); + } elsif ($tag =~ /^\s*<\?properties\s[^<>]*\?>/s) { + $ht{$xml_id}->{PROPERTIES} = $properties if ($properties) = ($tag =~ /^<\?properties\s+([^<>]*?)\s*\?>$/s); + } elsif ($tag =~ /^\s*<\?supplied-pmid\s[^<>]*\?>/s) { + # ignore pmid of referenced papers + } else { + print STDERR "Ignoring unrecognized tag $tag in line $line_number of $xml_id\n"; + $printed_message_p = 1; + # print LOG "Ignoring unrecognized tag $tag in line $line_number\n" if defined LOG; + } + $line_number += $this->n_newlines($tag); + $prev_text = $tag; + # text + } elsif (($text, $rest) = ($s =~ /^([^<>]+)(.*)$/s)) { + $ht{$xml_id}->{N_SUBTEXTS}->{$parent_node_id} = ($ht{$xml_id}->{N_SUBTEXTS}->{$parent_node_id} || 0) + 1; + $n_subs = $ht{$xml_id}->{N_SUBS}->{$parent_node_id} || 0; + $n_subs++; + $ht{$xml_id}->{N_SUBS}->{$parent_node_id} = $n_subs; + $node_id = join(".", $parent_node_id,$n_subs); + $ht{$xml_id}->{PARENT}->{$node_id} = $parent_node_id; + $ht{$xml_id}->{N_SUBS}->{$node_id} = 0; + $ht{$xml_id}->{TAG}->{$node_id} = "TEXT"; + $ht{$xml_id}->{TEXT}->{$node_id} = $text; + $line_number += $this->n_newlines($text); + $prev_text = $text; + # final catch all (should never get here) + } else { + $s = substr($s, 0, 160) . "....." if length($s) > 160; + print STDERR "Ignoring rest starting in $xml_id line $line_number: $prev_text *FROM-HERE->* $s\n"; + $printed_message_p = 1; + # print LOG "Ignoring rest starting in $xml_id line $line_number: $prev_text *FROM-HERE->* $s\n" if defined(LOG); + $rest = ""; + $prev_text = $s; + } + $s = $rest; + } + if ($nesting_level) { + print STDERR "Unbalanced xml (ending with nesting level $nesting_level) at line $line_number in $xml_id \n"; + $printed_message_p = 1; + } + # print LOG "Unbalanced xml (ending with nesting level $nesting_level)\n" if $nesting_level && defined(LOG); + $line_number--; + unless ($control =~ /\bsilent\b/) { + print STDERR "Read in $line_number lines.\n"; + $printed_message_p = 1; + } + # print LOG "Read in $line_number lines.\n" unless ($control =~ /\bsilent\b/) || (! defined(LOG)); + print STDERR "\n" if $printed_message_p; +} + +sub arg_value { + local($this, $node_id, *ht, $xml_id, $slot) = @_; + + return (defined($value = $ht{$xml_id}->{ARG}->{$node_id}->{$slot})) ? $value : ""; +} + +sub text_value { + local($this, $node_id, *ht, $xml_id) = @_; + + my $value = $ht{$xml_id}->{TEXT}->{$node_id}; + return $value if defined($value); + my $text_node_id = $this->sub_node_of_tag($node_id, *ht, $xml_id, 0, "TEXT"); + return "" unless $text_node_id; + $value = $ht{$xml_id}->{TEXT}->{$text_node_id}; + return $value if defined($value); + return ""; +} + +sub tag_value { + local($this, $node_id, *ht, $xml_id) = @_; + return $ht{$xml_id}->{TAG}->{$node_id} || ""; +} + +sub write_xml { + local($this, $parent_node_id, *ht, $xml_id, $schema, $indent, $rec_p) = @_; + # converts ht structure to string + # root has $parent_node_id = "1"; + + $indent = 0 unless defined($indent); + $rec_p = 0 unless defined($rec_p); + # print STDERR "write_xml($parent_node_id) REC: $rec_p\n"; + my $result = ""; + my $tag = $ht{$xml_id}->{TAG}->{$parent_node_id} || ""; + my $n_subs = $ht{$xml_id}->{N_SUBS}->{$parent_node_id} || 0; + if ($tag) { + if ($tag eq "TEXT") { + $result .= $ht{$xml_id}->{TEXT}->{$parent_node_id}; + } else { + if ($indent) { + $result .= "\n" if $rec_p; + my $n_periods = (() = ($parent_node_id =~ /\./g)); + foreach $i ((2 .. $n_periods)) { + foreach $j ((1 .. $indent)) { + $result .= " "; + } + } + } + $result .= "<$tag"; + foreach $slot (sort keys %{$ht{$xml_id}->{ARG}->{$parent_node_id}}) { + $value = $ht{$xml_id}->{ARG}->{$parent_node_id}->{$slot}; + $result .= " $slot=\"$value\""; + } + $result .= " \/" unless $n_subs; + $result .= ">"; + } + } + foreach $i ((1 .. $n_subs)) { + my $node_id = join(".", $parent_node_id,$i); + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + if ($tag eq "TEXT") { + $result .= $ht{$xml_id}->{TEXT}->{$node_id}; + } else { + $result .= $this->write_xml($node_id, *ht, $xml_id, $schema, $indent, 1); + } + } + if ($n_subs) { + if ($indent && $ht{$xml_id}->{N_SUBTAGS}->{$parent_node_id}) { + $result .= "\n"; + my $n_periods = (() = ($parent_node_id =~ /\./g)); + foreach $i ((2 .. $n_periods)) { + foreach $j ((1 .. $indent)) { + $result .= " "; + } + } + } + $result .= "<\/$tag>" if $tag && ($tag ne "TEXT"); + } + return $result; +} + +sub write_xml_without_tags_at_ends { + local($this, $parent_node_id, *ht, $xml_id, $schema, $indent, $rec_p) = @_; + + return $this->trim_xml_tags_at_ends($this->write_xml($parent_node_id, *ht, $xml_id, $schema, $indent, $rec_p)); +} + +sub trim_xml_tags_at_ends { + local($this, $s) = @_; + + $s =~ s/^\s*<[^<>]*>//; + $s =~ s/<[^<>]*>\s*$//; + return $s; +} + +sub sub_node_of_tag { + local($this, $node_id, *ht, $xml_id, $rec_p, @tags) = @_; + + my @sub_nodes = $this->sub_nodes_of_tag($node_id, *ht, $xml_id, $rec_p, @tags); + return (@sub_nodes) ? $sub_nodes[0] : ""; +} + +sub sub_nodes_of_tag { + local($this, $node_id, *ht, $xml_id, $rec_p, @tags) = @_; + # Return (space separated list of) node IDs of nodes under $node_id with tag in @tags. + # If $rec_p, include nodes of tag in @tags inside node of tag in @tags. + # For any node, search under TOP $node_id "1". + + my $tag_s = join(" ", @tags); + @{$ht{$xml_id}->{SUB_NODES_OF_TYPE}->{$node_id}->{$tag_s}} = (); + $this->sub_nodes_of_tag_rec($node_id, $node_id, *ht, $xml_id, $rec_p, $tag_s, @tags); + return @{$ht{$xml_id}->{SUB_NODES_OF_TYPE}->{$node_id}->{$tag_s}}; +} + +sub sub_nodes_of_tag_rec { + local($this, $root_node_id, $parent_node_id, *ht, $xml_id, $rec_p, $tag_s, @tags) = @_; + + $n_subs = $ht{$xml_id}->{N_SUBS}->{$parent_node_id} || 0; + foreach $i ((1 .. $n_subs)) { + my $node_id = join(".", $parent_node_id, $i); + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + if ($util->member($tag, @tags)) { + push(@{$ht{$xml_id}->{SUB_NODES_OF_TYPE}->{$root_node_id}->{$tag_s}}, $node_id); + $this->sub_nodes_of_tag_rec($root_node_id, $node_id, *ht, $xml_id, $rec_p, $tag_s, @tags) + if $rec_p; + } else { + $this->sub_nodes_of_tag_rec($root_node_id, $node_id, *ht, $xml_id, $rec_p, $tag_s, @tags); + } + } +} + +sub direct_sub_nodes_of_tag { + local($this, $parent_node_id, *ht, $xml_id, @tags) = @_; + + my $tag_s = join(" ", @tags); + my $n_subs = $ht{$xml_id}->{N_SUBS}->{$parent_node_id} || 0; + @{$ht{$xml_id}->{DIRECT_SUB_NODES_OF_TYPE}->{$parent_node_id}->{$tag_s}} = (); + foreach $i ((1 .. $n_subs)) { + my $node_id = join(".", $parent_node_id, $i); + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + if ($util->member($tag, @tags)) { + push(@{$ht{$xml_id}->{DIRECT_SUB_NODES_OF_TYPE}->{$parent_node_id}->{$tag_s}}, $node_id); + } + } + return @{$ht{$xml_id}->{DIRECT_SUB_NODES_OF_TYPE}->{$parent_node_id}->{$tag_s}}; +} + +sub normalize_tags { + local($this, $s, $schema) = @_; + + $s =~ s/(<\/?)ce:(sub|sup)\b/$1$2/ig; + $s =~ s/(<\/?)ce:italic\b/$1i/ig; + $s =~ s/(<\/?)italic\b/$1i/ig; + # $s =~ s/(<\/?)bold\b/$1b/ig; + return $s; +} + +sub xml_to_html { + local($this, $s, $schema, $control, $snt_id) = @_; + + $schema = "" unless defined($schema); + $control = "" unless defined($control); + $snt_id = "" unless defined($snt_id); + my $result = ""; + while ($s ne "") { + if (($pre, $close1_s, $tag, $arg_s, $close2_s, $rest) = ($s =~ /^(.*?)<(\/|)([a-zA-Z][-_:a-zA-Z0-9]*)([^<>]*?)(\s*\/|)>(.*)$/s)) { + $result .= $pre; + $tag = lc $this->normalize_tags($tag, $schema); + if (($tag eq "a") && (($href_value) = ($arg_s =~ /\bhref="(https?:[^" ]+)"/i))) { + $result .= "<$close1_s$tag href=\"$href_value\" title=\"$href_value\" target=\"_EXT\"$close2_s>"; + } elsif (($tag eq "a") && ($arg_s =~ /^\s*(\s*onclick="popup\([^()"]+\);")?\s*$/)) { + $result .= "<$close1_s$tag$arg_s$close2_s>"; + } elsif (($tag eq "span") && ($arg_s =~ /^\s*(\s*\b(style|title)="[^"]*")*\s*$/)) { + $result .= "<$close1_s$tag$arg_s$close2_s>"; + } elsif ($tag =~ /^(b|i|sub|sup)$/) { + $result .= "<$close1_s$tag$close2_s>"; + } elsif (($tag =~ /^(ce:cross-refs?|ce:inter-ref|ext-link|xref)$/) && ($control =~ /color-markup/)) { + if ($close1_s) { + $result .= "<\/span>"; + } else { + my $style = "color:#A000A0;"; + my $tag_clause = "tag: <$tag$arg_s>"; + my $text_clause = ""; + my $onclick_clause = ""; + my $url = ""; + if (($text) = ($rest =~ /^([^<>]+)<\/$tag>/)) { + $text_clause = ": $text"; + if (($tag =~ /^(ce:inter-ref|ext-link)$/) + && $util->likely_valid_url_format($text) + && ($url = $text)) { + $style = "color:#0000A0;text-decoration:underline;"; + $onclick_clause = " onclick=\"window.open('$url', '_blank');\""; + } + } + $title = ($text) ? "$text\n$tag_clause" : $tag_clause; + $title = "url-entity$text_clause\n$tag_clause" + if $url && ($tag =~ /^(ce:inter-ref|ext-link)$/); + + $title = "figure$text_clause\n$tag_clause" + if ($tag =~ /^(ce:cross-ref)$/) && ($arg_s =~ / refid="fig\S+"/); + $title = "table$text_clause\n$tag_clause" + if ($tag =~ /^(ce:cross-ref)$/) && ($arg_s =~ / refid="tbl\S+"/); + $title = "publication$text_clause\n$tag_clause" + if ($tag =~ /^(ce:cross-ref)$/) && ($arg_s =~ / refid="bib\S+"/); + $title = "publications$text_clause\n$tag_clause" + if ($tag =~ /^(ce:cross-refs)$/) && ($arg_s =~ / refid="bib\S+(\sbib\S+)+"/); + + $title = "figure$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="fig\S+"/); + $title = "table$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="(table|tbl)\S+"/); + $title = "figure$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="app\S+"/) && ($text =~ /^(figure|fig\b)/i); + $title = "table$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="app\S+"/) && ($text =~ /^table/i); + $title = "publication$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="bib\S+"/); + $title = "publications$text_clause\n$tag_clause" + if ($tag =~ /^(xref)$/) && ($arg_s =~ / rid="bib\S+(\sbib\S+)+"/); + + $title = $util->guard_html($title); + $result .= ""; + } + } + $s = $rest; + } else { + $result .= $s; + $s = ""; + } + } + foreach $bc_anomality (split(/;/, $this->xml_balance_check($result, $snt_id))) { + if (($tag,$type,$count) = ($bc_anomality =~ /^(\S+):([a-z]+):(-?\d+)$/)) { + if (($count > 0) && ($tag =~ /^(b|bold|i|sub|sup)$/)) { + foreach $i ((1 .. $count)) { + my $close_tag = "<\/$tag>"; + $result .= $close_tag; + print STDERR "Adding $close_tag to line $snt_id\n" if $snt_id; + } + } + } + } + return $result; +} + +sub xml_balance_check { + local($this, $s, $snt_id) = @_; + + my %bc_ht = (); + my @bc_anomalities = (); + while ($s ne "") { + if (($close1_s, $tag, $close2_s, $rest) = ($s =~ /^.*?<(\/|)([a-zA-Z][-_:a-zA-Z0-9]*)[^<>]*?(\s*\/|)>(.*)$/s)) { + $tag = lc $tag; + # open&close tag + if ($close2_s ne "") { + $bc_ht{OPEN_TAG_COUNT}->{$tag} = $bc_ht{OPEN_TAG_COUNT}->{$tag} || 0; + # open tag + } elsif ($close1_s eq "") { + $bc_ht{OPEN_TAG_COUNT}->{$tag} = ($bc_ht{OPEN_TAG_COUNT}->{$tag} || 0) + 1; + # close tag + } else { + $bc_ht{OPEN_TAG_COUNT}->{$tag} = ($bc_ht{OPEN_TAG_COUNT}->{$tag} || 0) - 1; + $bc_ht{TAG_COUNT_UNDERFLOW}->{$tag} = ($bc_ht{TAG_COUNT_UNDERFLOW}->{$tag} || 0) + 1 + if $bc_ht{OPEN_TAG_COUNT} < 0; + } + $s = $rest; + } else { + $s = ""; + } + } + foreach $tag (sort keys %{$bc_ht{OPEN_TAG_COUNT}}) { + push(@bc_anomalities, "$tag:o:$open_tag_count") if $open_tag_count = $bc_ht{OPEN_TAG_COUNT}->{$tag}; + push(@bc_anomalities, "$tag:u:$tag_count_underflow") if $tag_count_underflow = $bc_ht{TAG_COUNT_UNDERFLOW}->{$tag}; + } + # print STDERR "xml_balance_check ($snt_id): @bc_anomalities\n" if $snt_id && ($snt_id =~ /^\d+$/) && ($snt_id <= 25); + return join(";", @bc_anomalities); +} + +sub extract_ldc_snts { + local($this, $filename, *ht, $xml_id, $doc_id, $schema) = @_; + + my @paras = (); + my @snts = (); + my $root_node_id = "1"; + my $headline = ""; + my $dateline = ""; + $this->read_xml_file($filename, *ht, $xml_id, $schema); + $doc_id = $this->find_doc_id(*ht, $xml_id, "elsxml") unless $doc_id; + + if ($headline_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "headline")) { + $headline = $this->write_xml_without_tags_at_ends($headline_node_id, *ht, $xml_id, $schema); + $headline =~ s/^\s*//; + $headline =~ s/\s*$//; + push(@paras, $headline); + } + if ($dateline_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "dateline")) { + $dateline = $this->write_xml_without_tags_at_ends($dateline_node_id, *ht, $xml_id, $schema); + $dateline =~ s/^\s*//; + $dateline =~ s/\s*$//; + push(@paras, $dateline); + } + if ($text_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "text")) { + my $node_index = 0; + foreach $node_id ($this->sub_nodes_of_tag($text_node_id, *ht, $xml_id, 0, "p")) { + $node_index++; + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + my $text = $this->write_xml_without_tags_at_ends($node_id, *ht, $xml_id, $schema); + $text =~ s/^\s*//; + $text =~ s/\s*$//; + print STDERR "headline: $headline\nnode_index: $node_index\ntext: $text\n" if $text =~ /IFC signs deal to expand insurance to farmers/; + push(@paras, $text) + unless ($text =~ /It is a condensed version of a story that will appear in tomorrow.*s New York Times./) + || ($text =~ /(?:EDS:|Eds:)/) + || (($node_index == 1) && ($text eq $headline)); + } + } + foreach $para (@paras) { + $para = $this->html2guarded_utf8($para); + $para = $utf8->html2utf8($para); + $para = $utf8->xhtml2utf8($para); + $para =~ s/\xC2\xA0/ /g; # nbsp -> space + $para = $util->normalize_extreme_string($para); + $para = $this->normalize_tags($para, $schema); + foreach $snt (split(/\n/, $snt_segm->segment($para))) { + push(@snts, $snt); + } + } + return join("\n", @snts); +} + +sub extract_elsxml_paper_snts { + local($this, $filename, *ht, $xml_id, $doc_id, $schema) = @_; + + my @paras = (); + my @snts = (); + my $root_node_id = "1"; + $this->read_xml_file($filename, *ht, $xml_id, $schema); + $doc_id = $this->find_doc_id(*ht, $xml_id, "elsxml") unless $doc_id; + + if ($title_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "ce:title")) { + my $title = $this->write_xml_without_tags_at_ends($title_node_id, *ht, $xml_id, $schema); + push(@paras, "$title ($doc_id)"); + } + if ($abstract_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "ce:abstract")) { + foreach $para_node_id ($this->sub_nodes_of_tag($abstract_node_id, *ht, $xml_id, 0, "ce:simple-para")) { + my $para = $this->write_xml_without_tags_at_ends($para_node_id, *ht, $xml_id, $schema); + push(@paras, $para); + } + } + if ($sections_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "ce:sections")) { + foreach $node_id ($this->sub_nodes_of_tag($sections_node_id, *ht, $xml_id, 0, "ce:section-title", "ce:para")) { + my $tag = $ht{$xml_id}->{TAG}->{$node_id}; + my $text = $this->write_xml_without_tags_at_ends($node_id, *ht, $xml_id, $schema); + push(@paras, $text); + } + } + foreach $figure_node_id ($this->sub_nodes_of_tag($root_node_id, *ht, $xml_id, 0, "ce:figure")) { + foreach $para_node_id ($this->sub_nodes_of_tag($figure_node_id, *ht, $xml_id, 0, "ce:label", "ce:simple-para")) { + my $para = $this->write_xml_without_tags_at_ends($para_node_id, *ht, $xml_id, $schema); + push(@paras, $para); + } + } + foreach $para (@paras) { + $para = $this->html2guarded_utf8($para); + $para = $utf8->html2utf8($para); + $para = $utf8->xhtml2utf8($para); + $para =~ s/\xC2\xA0/ /g; # nbsp -> space + $para = $util->normalize_extreme_string($para); + $para = $this->normalize_tags($para, $schema); + foreach $snt (split(/\n/, $snt_segm->segment($para))) { + push(@snts, $snt); + } + } + return join("\n", @snts); +} + +sub extract_nxml_paper_snts { + local($this, $filename, *ht, $xml_id, $doc_id, $schema) = @_; + + my @paras = (); + my @fig_paras = (); + my @snts = (); + my $root_node_id = "1"; + my %visited_node_ids = (); + $this->read_xml_file($filename, *ht, $xml_id, $schema); + $doc_id = $this->find_doc_id(*ht, $xml_id, "nxml") unless $doc_id; + + if ($title_node_id = $this->sub_node_of_tag($root_node_id, *ht, $xml_id, 0, "article-title")) { + my $title = $this->write_xml_without_tags_at_ends($title_node_id, *ht, $xml_id, $schema); + push(@paras, "$title ($doc_id)"); + } + foreach $abstract_node_id ($this->sub_nodes_of_tag($root_node_id, *ht, $xml_id, 0, "abstract")) { + foreach $para_node_id ($this->sub_nodes_of_tag($abstract_node_id, *ht, $xml_id, 0, "title", "p")) { + if (@sub_para_node_ids = $this->sub_nodes_of_tag($para_node_id, *ht, $xml_id, 0, "p")) { + foreach $sub_para_node_id (@sub_para_node_ids) { + my $sub_para = $this->write_xml_without_tags_at_ends($sub_para_node_id, *ht, $xml_id, $schema); + my $tag = $ht{$xml_id}->{TAG}->{$sub_para_node_id}; + if ($tag && ($tag eq "title")) { + my $sec_level = $this->level_of_tag_nesting(*ht, $xml_id, $sub_para_node_id, "sec"); + $sub_para = "$sub_para<\/sec-title>"; + } + push(@paras, $sub_para); + } + } else { + my $para = $this->write_xml_without_tags_at_ends($para_node_id, *ht, $xml_id, $schema); + my $tag = $ht{$xml_id}->{TAG}->{$para_node_id}; + if ($tag && ($tag eq "title")) { + my $sec_level = $this->level_of_tag_nesting(*ht, $xml_id, $para_node_id, "sec"); + $para = "$para<\/sec-title>"; + } + push(@paras, $para); + } + } + } + foreach $float_node_id ($this->sub_nodes_of_tag($root_node_id, *ht, $xml_id, 0, "fig", "table-wrap")) { + foreach $para_node_id ($this->sub_nodes_of_tag($float_node_id, *ht, $xml_id, 0, "label", "p")) { + next if $visited_node_ids{$para_node_id}; + $visited_node_ids{$para_node_id} = 1; + my $para = $this->write_xml_without_tags_at_ends($para_node_id, *ht, $xml_id, $schema); + next unless $para =~ /\S/; + my $tag = $ht{$xml_id}->{TAG}->{$para_node_id}; + my $parent_node_id = $ht{$xml_id}->{PARENT}->{$para_node_id} || ""; + my $parent_tag = ($parent_node_id) ? $ht{$xml_id}->{TAG}->{$parent_node_id} : ""; + if (($tag eq "label") && ($parent_tag =~ /\btable\b/i)) { + $para = "