Skip to content

Commit

Permalink
Add MF2 test generator
Browse files Browse the repository at this point in the history
  • Loading branch information
mradbourne authored Feb 27, 2024
1 parent e30e526 commit b11be59
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

.pylintrc
.idea
.devcontainer

executors/rust/target/
**/__pycache__/
Expand Down
14 changes: 14 additions & 0 deletions setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,21 @@ fi
# Create the gh-cache directory when it does not already exist.
if [[ ! -d gh-cache ]]
then
mkdir -p gh-cache
fi

# Ensure that the Python `enum` module is importable.
# Github Actions uses Python 3.10 as of Feb 2024
#
# The inline script exits with status 1 when the module cannot be located.
# `sys` is imported explicitly so that the non-zero status comes from
# sys.exit() rather than from an accidental NameError, and
# importlib.util.find_spec replaces the deprecated pkgutil.find_loader.
python3 -c 'import importlib.util
import sys
if importlib.util.find_spec("enum"):
    print("The enum module is already installed")
else:
    print("The enum module is not installed yet")
    sys.exit(1)
'
error_code=$?
if [[ $error_code -ne 0 ]]
then
  sudo apt-get install python3-enum34
fi


Expand Down
21 changes: 21 additions & 0 deletions testgen/icu75/message_fmt2/syntax/literal-text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"scenario": "Literal text",
"description": "Text placed directly into the pattern",
"defaultTestProperties": {
"testSubtype": "syntax"
},
"tests": [
{
"description": "Includes basic literals",
"locale": "en-US",
"pattern": "hello world",
"verify": "hello world"
},
{
"description": "Includes unquoted literals",
"locale": "en-US",
"pattern": "hello {world}",
"verify": "hello world"
}
]
}
33 changes: 33 additions & 0 deletions testgen/icu75/message_fmt2/syntax/whitespace.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"scenario": "Whitespace",
"description": "Leading and trailing space characters",
"defaultTestProperties": {
"testSubtype": "syntax"
},
"tests": [
{
"description": "Preserves leading space on basic literals",
"locale": "en-US",
"pattern": " hello world",
"verify": " hello world"
},
{
"description": "Removes leading space on unquoted literals",
"locale": "en-US",
"pattern": "hello { world}",
"verify": "hello world"
},
{
"description": "Preserves trailing space on basic literals",
"locale": "en-US",
"pattern": "hello world ",
"verify": "hello world "
},
{
"description": "Removes trailing space on unquoted literals",
"locale": "en-US",
"pattern": "hello {world }",
"verify": "hello world"
}
]
}
125 changes: 106 additions & 19 deletions testgen/testdata_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
import os
import re
import requests
import sys
from enum import Enum
import glob
from pathlib import Path
from jsonschema import Draft202012Validator, ValidationError

# Matches a line that is empty or contains only whitespace.
# A raw string is used so that `\s` reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape sequence.
reblankline = re.compile(r'^\s*$')

Expand All @@ -22,6 +25,17 @@
NUMBERFORMAT_LOCALE_INDICES = [3, 7, 11]


class TestType(str, Enum):
    """Identifiers of the test types this generator can produce.

    Members are also `str` instances, so they compare equal to their plain
    string values (for example the strings parsed from the command line).
    """

    NUMBER_FMT = 'number_fmt'
    COLLATION_SHORT = 'collation_short'
    LANG_NAMES = 'lang_names'
    LIKELY_SUBTAGS = 'likely_subtags'
    MESSAGE_FMT2 = 'message_fmt2'

    def __str__(self):
        # Render as the bare value (e.g. 'message_fmt2') rather than the
        # default 'TestType.MESSAGE_FMT2' form.
        return self.value


class generateData():
def __init__(self, icu_version):
self.icu_version = icu_version
Expand All @@ -34,10 +48,10 @@ def setVersion(self, selected_version):
self.icu_version = selected_version

def saveJsonFile(self, filename, data, indent=None):
    """Serialize `data` as JSON under the DDT_DATA/testData tree.

    The file is written to
    `<this file's directory>/../DDT_DATA/testData/<icu_version>/<filename>`;
    missing parent directories are created first.

    Args:
        filename: Name of the output file, relative to the version directory.
        data: JSON-serializable object to write.
        indent: Passed through to `json.dump` to control pretty-printing.
    """
    output_path = Path(os.path.dirname(__file__), '..', 'DDT_DATA', 'testData', self.icu_version, filename)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # The context manager closes the file even if serialization fails.
    with open(output_path, 'w', encoding='UTF-8') as output_file:
        json.dump(data, output_file, indent=indent)

def getTestDataFromGitHub(self, datafile_name, version):
# Path for fetching test data from ICU repository
Expand Down Expand Up @@ -318,20 +332,80 @@ def processLikelySubtagsData(self):
logging.info('Likely Subtags Test (%s): %d lines processed', self.icu_version, count)
return

def processMessageFmt2TestData(self):
    """Generate the MessageFormat2 test and verification JSON files.

    Every `*.json` file found recursively under
    `<this file's directory>/<icu_version>/message_fmt2` is read, checked
    against `schema/message_fmt2/testgen_schema.json`, and flattened into
    two parallel lists: test inputs and expected results, joined by a
    zero-padded label. Per-test values fall back to the file's
    `defaultTestProperties` when a test does not supply them.

    Schema violations are logged and the file is still processed on a
    best-effort basis. Files without the basic structure, and tests that
    lack a required value, are logged and omitted instead of aborting the
    run. Both lists are passed through `self.sample_tests` and then written
    with `self.saveJsonFile`.
    """
    json_test = {'test_type': str(TestType.MESSAGE_FMT2), 'tests': []}
    json_verify = {'test_type': str(TestType.MESSAGE_FMT2), 'verifications': []}

    src_dir = Path(os.path.dirname(__file__), self.icu_version, TestType.MESSAGE_FMT2)
    # Sorted so that labels are assigned in a reproducible order.
    src_file_paths = sorted(glob.glob(os.path.join(src_dir, '**', '*.json'), recursive=True))

    json_schema_path = Path(os.path.dirname(__file__), '..', 'schema', TestType.MESSAGE_FMT2, 'testgen_schema.json')
    # Close the schema file deterministically instead of leaking the handle.
    with open(json_schema_path, encoding='utf-8') as schema_file:
        json_schema_validator = Draft202012Validator(json.load(schema_file))

    def pick(src_test, defaults, key):
        # Only a missing (or null) value falls back to the file defaults, so
        # that falsy-but-valid values such as an empty pattern are kept.
        # Raises KeyError when neither the test nor the defaults provide it.
        value = src_test.get(key)
        return defaults[key] if value is None else value

    test_count = 0
    test_list = []
    verify_list = []

    for test_file_path in src_file_paths:
        src_data = readFile(test_file_path, filetype='json')
        if src_data is None:
            logging.error('Problem reading JSON. Omitting file %s', test_file_path)
            continue

        try:
            json_schema_validator.validate(src_data)
        except ValidationError as err:
            logging.error('Problem validating JSON: %s', test_file_path)
            logging.error(err)

        # Validation failures are only logged, so guard the structure that
        # the loop below relies on.
        if not isinstance(src_data, dict) or 'scenario' not in src_data or 'tests' not in src_data:
            logging.error('Missing scenario or tests. Omitting file %s', test_file_path)
            continue

        scenario = src_data['scenario']
        # A file may legitimately have no defaults section.
        defaults = src_data.get('defaultTestProperties') or {}

        for src_test in src_data['tests']:
            test_count += 1
            label = f'{test_count - 1:05d}'
            description = scenario
            args = src_test.get('args') or defaults.get('args')

            # Build both entries before appending either one, so a test is
            # never emitted without its matching verification.
            try:
                description = f'{scenario}: {src_test["description"]}'
                test_entry = {
                    'label': label,
                    'test_description': description,
                    'test_subtype': pick(src_test, defaults, 'testSubtype'),
                    'locale': pick(src_test, defaults, 'locale'),
                    'pattern': pick(src_test, defaults, 'pattern'),
                    **({'args': args} if args else {})
                }
                verify_entry = {
                    'label': label,
                    'verify': src_test['verify']
                }
            except KeyError as err:
                logging.error('Missing value for %s in %s', err, test_file_path)
                logging.error('Omitting test %s (%s)', label, description)
                continue

            test_list.append(test_entry)
            verify_list.append(verify_entry)

    json_test["tests"] = self.sample_tests(test_list)
    json_verify["verifications"] = self.sample_tests(verify_list)

    self.saveJsonFile(f'{TestType.MESSAGE_FMT2}_test.json', json_test, 2)
    self.saveJsonFile(f'{TestType.MESSAGE_FMT2}_verify.json', json_verify, 2)

    logging.info('MessageFormat2 Test (%s): %d tests processed', self.icu_version, test_count)


# Utility functions
def computeMaxDigitsForCount(count):
    """Return the base-10 logarithm of `count + 1`, rounded up to an integer."""
    magnitude = math.log10(count + 1)
    return math.ceil(magnitude)


def readFile(filename, version='', filetype='txt'):
    """Read a test source file and return its contents.

    Args:
        filename: Path of the file to read.
        version: Optional subdirectory containing the test source; when
            given it is joined in front of `filename`.
        filetype: 'json' to parse the file and return the decoded object;
            any other value returns the raw text.

    Returns:
        The file text (or decoded JSON), or None when the file could not be
        read or parsed. Failures are logged as warnings.
    """
    # If version is provided, it refers to a subdirectory containing the test source
    path = os.path.join(version, filename) if version else filename
    try:
        with open(path, 'r', encoding='utf-8') as testdata:
            if filetype == 'json':
                return json.load(testdata)
            return testdata.read()
    except Exception as err:
        # Best-effort read: report and continue, but do not swallow
        # KeyboardInterrupt / SystemExit as `except BaseException` would.
        logging.warning('** READ: Error = %s', err)
        return None
Expand Down Expand Up @@ -1013,18 +1087,20 @@ def insertNumberFmtDescr(tests_obj, verify_obj):
def setupArgs():
    """Define the testdata_gen command-line options and parse the arguments.

    Returns:
        The argparse namespace with `icu_versions`, `test_types` (defaults to
        every TestType value) and `run_limit`.
    """
    arg_parser = argparse.ArgumentParser(prog='testdata_gen')
    arg_parser.add_argument('--icu_versions', nargs='*', default=[])

    # Every known test type is both the set of valid choices and the default.
    known_types = [member.value for member in TestType]
    arg_parser.add_argument(
        '--test_types', nargs='*', choices=known_types, default=known_types)

    # -1 is no limit
    arg_parser.add_argument('--run_limit', nargs='?', type=int, default=-1)

    return arg_parser.parse_args()


def generate_versioned_data_parallel(icu_versions, args):
def generate_versioned_data_parallel(args):
num_processors = mp.cpu_count()
logging.info('Test data generation: %s processors for %s plans' , num_processors, len(icu_versions))
logging.info('Test data generation: %s processors for %s plans' , num_processors, len(args.icu_versions))

version_data = []
for icu_version in icu_versions:
for icu_version in args.icu_versions:
version_data.append(
{
'icu_version': icu_version,
Expand All @@ -1038,6 +1114,7 @@ def generate_versioned_data_parallel(icu_versions, args):

return result


def generate_versioned_data(version_info):
new_args = version_info['args']
icu_version = version_info['icu_version']
Expand All @@ -1047,28 +1124,38 @@ def generate_versioned_data(version_info):
logging.info('Generating .json files for data driven testing. ICU_VERSION requested = %s',
icu_version)

data_generator.processNumberFmtTestData()
if len(new_args.test_types) < len(TestType):
logging.info('(Only generating %s)', ', '.join(new_args.test_types))

if TestType.NUMBER_FMT in new_args.test_types:
data_generator.processNumberFmtTestData()

if TestType.COLLATION_SHORT in new_args.test_types:
# This is slow
data_generator.processCollationTestData()

# This is slow
data_generator.processCollationTestData()
if TestType.LIKELY_SUBTAGS in new_args.test_types:
data_generator.processLikelySubtagsData()

data_generator.processLikelySubtagsData()
if TestType.LANG_NAMES in new_args.test_types:
# This is slow
data_generator.processLangNameTestData()

# This is slow
data_generator.processLangNameTestData()
if TestType.MESSAGE_FMT2 in new_args.test_types:
data_generator.processMessageFmt2TestData()

logging.info('++++ Data generation for %s is complete.', icu_version)


def main():
    """Parse the command-line options and run test data generation.

    Arguments come from `setupArgs()`; the parsed namespace is handed to
    `generate_versioned_data_parallel`, which reads the ICU versions from it.
    """
    new_args = setupArgs()

    # NOTE(review): this Logger instance is not used by the logging.* calls
    # in this block -- confirm whether root logging is configured elsewhere.
    logger = logging.Logger("TEST_GENERATE LOGGER")
    logger.setLevel(logging.INFO)

    # Generate version data in parallel if possible
    generate_versioned_data_parallel(new_args)


if __name__ == '__main__':
    main()

0 comments on commit b11be59

Please sign in to comment.