Skip to content

Commit

Permalink
chg: simplify validation of choices and tag names
Browse files Browse the repository at this point in the history
- xls2json.py:
  - move choices checks to new validator module
  - include row reference in all errors/warnings. To have a consistent
    error message structure, each duplicate choice gets a copy of the
    message (in one error) rather than being smushed into one message.
- xlsparseutils.py
  - replace separate regex for xml tags with expression parser and use
    that instead, for consistent parsing rules and to use cache
  - move the remaining sheet misspellings check to new validator module
- expression.py
  - fix issue where a name containing "or" or "and" was parsed as an
    operator rather than a name. These tokens are only valid when
    surrounded by spaces so the regex is updated accordingly.
  - add tests for positive and negative match cases for tag names.
  • Loading branch information
lindsay-stevens committed Nov 2, 2024
1 parent 88b0328 commit b65e727
Show file tree
Hide file tree
Showing 13 changed files with 189 additions and 110 deletions.
7 changes: 4 additions & 3 deletions pyxform/entities/entities_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from pyxform import constants as const
from pyxform.aliases import yes_no
from pyxform.errors import PyXFormError
from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
from pyxform.parsing.expression import is_xml_tag
from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings

EC = const.EntityColumns

Expand Down Expand Up @@ -73,7 +74,7 @@ def get_validated_dataset_name(entity):
f"Invalid entity list name: '{dataset}'. Names may not include periods."
)

if not is_valid_xml_tag(dataset):
if not is_xml_tag(dataset):
if isinstance(dataset, bytes):
dataset = dataset.decode("utf-8")

Expand Down Expand Up @@ -118,7 +119,7 @@ def validate_entity_saveto(
f"{error_start} the entity property name '{save_to}' starts with reserved prefix {const.ENTITIES_RESERVED_PREFIX}."
)

if not is_valid_xml_tag(save_to):
if not is_xml_tag(save_to):
if isinstance(save_to, bytes):
save_to = save_to.decode("utf-8")

Expand Down
18 changes: 16 additions & 2 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def get_expression_lexer() -> re.Scanner:
"TIME": time_regex,
"NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
# https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
"OPS_MATH": r"[\*\+\-]|mod|div",
"OPS_MATH": r"[\*\+\-]| mod | div ",
"OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
"OPS_BOOL": r"and|or",
"OPS_BOOL": r" and | or ",
"OPS_UNION": r"\|",
"OPEN_PAREN": r"\(",
"CLOSE_PAREN": r"\)",
Expand Down Expand Up @@ -107,3 +107,17 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b
return True
else:
return False


def is_pyxform_reference(value: str) -> bool:
"""
Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
"""
return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))


def is_xml_tag(value: str) -> bool:
"""
Does the input string contain only a valid XML tag / element name?
"""
return is_single_token_expression(expression=value, token_types=("NAME",))
4 changes: 2 additions & 2 deletions pyxform/survey_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pyxform import aliases as alias
from pyxform import constants as const
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_xml_tag
from pyxform.question_type_dictionary import QUESTION_TYPE_DICT
from pyxform.utils import (
BRACKETED_TAG_REGEX,
Expand All @@ -19,7 +20,6 @@
node,
)
from pyxform.xls2json import print_pyobj_to_json
from pyxform.xlsparseutils import is_valid_xml_tag

if TYPE_CHECKING:
from pyxform.utils import DetachableElement
Expand Down Expand Up @@ -140,7 +140,7 @@ def add_children(self, children):
SUPPORTED_MEDIA = ("image", "big-image", "audio", "video")

def validate(self):
if not is_valid_xml_tag(self.name):
if not is_xml_tag(self.name):
invalid_char = re.search(INVALID_XFORM_TAG_REGEXP, self.name)
raise PyXFormError(
f"The name '{self.name}' contains an invalid character '{invalid_char.group(0)}'. Names {const.XML_IDENTIFIER_ERROR_MESSAGE}"
Expand Down
59 changes: 59 additions & 0 deletions pyxform/validators/pyxform/choices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pyxform import constants
from pyxform.errors import PyXFormError

INVALID_NAME = (
"[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
"Choices must have a name. "
"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_LABEL = (
"[row : {row}] On the 'choices' sheet, the 'label' value is invalid. "
"Choices should have a label. "
"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_HEADER = (
"[row : 1] On the 'choices' sheet, the '{column}' value is invalid. "
"Column headers must not be empty and must not contain spaces. "
"Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
)
INVALID_DUPLICATE = (
"[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
"Choice names must be unique for each choice list. "
"If this is intentional, use the setting 'allow_choice_duplicates'. "
"Learn more: https://xlsform.org/#choice-names."
)


def validate_headers(
headers: tuple[tuple[str, ...], ...], warnings: list[str]
) -> list[str]:
def check():
for header in headers:
header = header[0]
if header != constants.LIST_NAME_S and (" " in header or header == ""):
warnings.append(INVALID_HEADER.format(column=header))
yield header

return list(check())


def validate_choices(
options: list[dict], warnings: list[str], allow_duplicates: bool = False
) -> None:
seen_options = set()
duplicate_errors = []
for option in options:
if "name" not in option:
raise PyXFormError(INVALID_NAME.format(row=option["__row"]))
elif "label" not in option:
warnings.append(INVALID_LABEL.format(row=option["__row"]))

if not allow_duplicates:
name = option["name"]
if name in seen_options:
duplicate_errors.append(INVALID_DUPLICATE.format(row=option["__row"]))
else:
seen_options.add(name)

if 0 < len(duplicate_errors):
raise PyXFormError("\n".join(duplicate_errors))
6 changes: 2 additions & 4 deletions pyxform/validators/pyxform/question_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.parsing.expression import is_pyxform_reference
from pyxform.utils import PYXFORM_REFERENCE_REGEX

BACKGROUND_GEOPOINT_CALCULATION = "[row : {r}] For 'background-geopoint' questions, the 'calculation' column must be empty."
Expand All @@ -25,9 +25,7 @@ def validate_background_geopoint_calculation(row: dict, row_num: int) -> bool:

def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
"""A background-geopoint must have a trigger."""
if not row.get("trigger", False) or not is_single_token_expression(
expression=row["trigger"], token_types=["PYXFORM_REF"]
):
if not row.get("trigger", False) or not is_pyxform_reference(value=row["trigger"]):
raise PyXFormError(TRIGGER_INVALID.format(r=row_num, t=row["type"]))
return True

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
import re
from collections.abc import KeysView

from pyxform import constants
from pyxform.utils import levenshtein_distance

# http://www.w3.org/TR/REC-xml/
TAG_START_CHAR = r"[a-zA-Z:_]"
TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
XFORM_TAG_REGEXP = re.compile(rf"^{TAG_START_CHAR}{TAG_CHAR}*$")


def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
"""
Expand Down Expand Up @@ -36,10 +30,3 @@ def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
return msg
else:
return None


def is_valid_xml_tag(tag):
"""
Use a regex to see if there are any invalid characters (i.e. spaces).
"""
return re.search(XFORM_TAG_REGEXP, tag)
108 changes: 35 additions & 73 deletions pyxform/xls2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import re
import sys
from collections import Counter
from typing import IO, Any

from pyxform import aliases, constants
Expand All @@ -21,15 +20,16 @@
validate_entity_saveto,
)
from pyxform.errors import PyXFormError
from pyxform.parsing.expression import is_single_token_expression
from pyxform.parsing.expression import is_pyxform_reference, is_xml_tag
from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic
from pyxform.validators.pyxform import choices as vc
from pyxform.validators.pyxform import parameters_generic, select_from_file
from pyxform.validators.pyxform import question_types as qt
from pyxform.validators.pyxform.android_package_name import validate_android_package_name
from pyxform.validators.pyxform.pyxform_reference import validate_pyxform_reference_syntax
from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
from pyxform.validators.pyxform.translations_checks import SheetTranslations
from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag

SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
RE_SMART_QUOTES = re.compile(r"|".join(re.escape(old) for old in SMART_QUOTES))
Expand Down Expand Up @@ -175,7 +175,12 @@ def dealias_types(dict_array):
return dict_array


def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool = False):
def clean_text_values(
sheet_name: str,
data: list[dict],
strip_whitespace: bool = False,
add_row_number: bool = False,
) -> list[dict]:
"""
Go though the dict array and strips all text values.
Also replaces multiple spaces with single spaces.
Expand All @@ -192,6 +197,8 @@ def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool
validate_pyxform_reference_syntax(
value=value, sheet_name=sheet_name, row_number=row_number, key=key
)
if add_row_number:
row["__row"] = row_number
return data


Expand Down Expand Up @@ -513,7 +520,11 @@ def workbook_to_json(

# ########## Choices sheet ##########
choices_sheet = workbook_dict.get(constants.CHOICES, [])
choices_sheet = clean_text_values(sheet_name=constants.CHOICES, data=choices_sheet)
choices_sheet = clean_text_values(
sheet_name=constants.CHOICES,
data=choices_sheet,
add_row_number=True,
)
choices_sheet = dealias_and_group_headers(
dict_array=choices_sheet,
header_aliases=aliases.list_header,
Expand All @@ -523,73 +534,27 @@ def workbook_to_json(
choices = group_dictionaries_by_key(
list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S
)
if 0 < len(choices):
json_dict[constants.CHOICES] = choices
# To combine the warning into one message, the check for missing choices translation
# columns is run with Survey sheet below.

# Make sure all the options have the required properties:
warnedabout = set()
for list_name, options in choices.items():
# Warn and remove invalid headers in case the form uses headers for notes.
invalid_headers = vc.validate_headers(choices_sheet.headers, warnings)
allow_duplicates = aliases.yes_no.get(
settings.get("allow_choice_duplicates", False), False
)
for options in choices.values():
vc.validate_choices(
options=options,
warnings=warnings,
allow_duplicates=allow_duplicates,
)
for option in options:
if "name" not in option:
info = "[list_name : " + list_name + "]"
raise PyXFormError(
"On the choices sheet there is a option with no name. " + info
)
if "label" not in option:
info = "[list_name : " + list_name + "]"
warnings.append(
"On the choices sheet there is a option with no label. " + info
)
# chrislrobert's fix for a cryptic error message:
# see: https://code.google.com/p/opendatakit/issues/detail?id=833&start=200
option_keys = list(option.keys())
for headername in option_keys:
# Using warnings and removing the bad columns
# instead of throwing errors because some forms
# use choices column headers for notes.
if " " in headername:
if headername not in warnedabout:
warnedabout.add(headername)
warnings.append(
"On the choices sheet there is "
+ 'a column ("'
+ headername
+ '") with an illegal header. '
+ "Headers cannot include spaces."
)
del option[headername]
elif headername == "":
warnings.append(
"On the choices sheet there is a value"
+ " in a column with no header."
)
del option[headername]
list_name_choices = [option.get("name") for option in options]
if len(list_name_choices) != len(set(list_name_choices)):
duplicate_setting = settings.get("allow_choice_duplicates")
for v in Counter(list_name_choices).values():
if v > 1:
if not duplicate_setting or duplicate_setting.capitalize() != "Yes":
choice_duplicates = [
item
for item, count in Counter(list_name_choices).items()
if count > 1
]

if choice_duplicates:
raise PyXFormError(
"The name column for the '{}' choice list contains these duplicates: {}. Duplicate names "
"will be impossible to identify in analysis unless a previous value in a cascading "
"select differentiates them. If this is intentional, you can set the "
"allow_choice_duplicates setting to 'yes'. Learn more: https://xlsform.org/#choice-names.".format(
list_name,
", ".join(
[f"'{dupe}'" for dupe in choice_duplicates]
),
)
)
for invalid_header in invalid_headers:
option.pop(invalid_header, None)
del option["__row"]

if 0 < len(choices):
json_dict[constants.CHOICES] = choices

# ########## Entities sheet ###########
entities_sheet = workbook_dict.get(constants.ENTITIES, [])
Expand Down Expand Up @@ -945,7 +910,7 @@ def workbook_to_json(
ROW_FORMAT_STRING % row_number + " Question or group with no name."
)
question_name = str(row[constants.NAME])
if not is_valid_xml_tag(question_name):
if not is_xml_tag(question_name):
if isinstance(question_name, bytes):
question_name = question_name.decode("utf-8")

Expand Down Expand Up @@ -1022,10 +987,7 @@ def workbook_to_json(
repeat_count_expression = new_json_dict.get("control", {}).get("jr:count")
if repeat_count_expression:
# Simple expressions don't require a new node, they can reference directly.
simple_expression = is_single_token_expression(
expression=repeat_count_expression, token_types=["PYXFORM_REF"]
)
if not simple_expression:
if not is_pyxform_reference(value=repeat_count_expression):
generated_node_name = new_json_dict["name"] + "_count"
parent_children_array.append(
{
Expand Down
Empty file added tests/parsing/__init__.py
Empty file.
Loading

0 comments on commit b65e727

Please sign in to comment.