chg: simplify validation of choices and tag names

- xls2json.py: - move choices checks to new validator module - include row reference in all errors/warnings. To have a consistent error message structure, each duplicate choice gets a copy of the message (in one error) rather than being smushed into one message. - xlsparseutils.py - replace separate regex for xml tags with expression parser and use that instead, for consistent parsing rules and to use cache - move the remaining sheet misspellings check to new validator module - expression.py - fix issue where a name containing "or" or "and" was parsed as an operator rather than a name. These tokens are only valid when surrounded by spaces so the regex is updated accordingly. - add tests for positive and negative match cases for tag names.
XLSForm · Nov 2, 2024 · b65e727 · b65e727
1 parent 88b0328
commit b65e727
Show file tree

Hide file tree

Showing 13 changed files with 189 additions and 110 deletions.
diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py
@@ -3,7 +3,8 @@
 from pyxform import constants as const
 from pyxform.aliases import yes_no
 from pyxform.errors import PyXFormError
-from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
+from pyxform.parsing.expression import is_xml_tag
+from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
 
 EC = const.EntityColumns
 
@@ -73,7 +74,7 @@ def get_validated_dataset_name(entity):
             f"Invalid entity list name: '{dataset}'. Names may not include periods."
         )
 
-    if not is_valid_xml_tag(dataset):
+    if not is_xml_tag(dataset):
         if isinstance(dataset, bytes):
             dataset = dataset.decode("utf-8")
 
@@ -118,7 +119,7 @@ def validate_entity_saveto(
             f"{error_start} the entity property name '{save_to}' starts with reserved prefix {const.ENTITIES_RESERVED_PREFIX}."
         )
 
-    if not is_valid_xml_tag(save_to):
+    if not is_xml_tag(save_to):
         if isinstance(save_to, bytes):
             save_to = save_to.decode("utf-8")
 

diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py
@@ -37,9 +37,9 @@ def get_expression_lexer() -> re.Scanner:
         "TIME": time_regex,
         "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
         # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
-        "OPS_MATH": r"[\*\+\-]|mod|div",
+        "OPS_MATH": r"[\*\+\-]| mod | div ",
         "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
-        "OPS_BOOL": r"and|or",
+        "OPS_BOOL": r" and | or ",
         "OPS_UNION": r"\|",
         "OPEN_PAREN": r"\(",
         "CLOSE_PAREN": r"\)",
@@ -107,3 +107,17 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b
         return True
     else:
         return False
+
+
+def is_pyxform_reference(value: str) -> bool:
+    """
+    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}
+    """
+    return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",))
+
+
+def is_xml_tag(value: str) -> bool:
+    """
+    Does the input string contain only a valid XML tag / element name?
+    """
+    return is_single_token_expression(expression=value, token_types=("NAME",))
diff --git a/pyxform/survey_element.py b/pyxform/survey_element.py
@@ -11,6 +11,7 @@
 from pyxform import aliases as alias
 from pyxform import constants as const
 from pyxform.errors import PyXFormError
+from pyxform.parsing.expression import is_xml_tag
 from pyxform.question_type_dictionary import QUESTION_TYPE_DICT
 from pyxform.utils import (
     BRACKETED_TAG_REGEX,
@@ -19,7 +20,6 @@
     node,
 )
 from pyxform.xls2json import print_pyobj_to_json
-from pyxform.xlsparseutils import is_valid_xml_tag
 
 if TYPE_CHECKING:
     from pyxform.utils import DetachableElement
@@ -140,7 +140,7 @@ def add_children(self, children):
     SUPPORTED_MEDIA = ("image", "big-image", "audio", "video")
 
     def validate(self):
-        if not is_valid_xml_tag(self.name):
+        if not is_xml_tag(self.name):
             invalid_char = re.search(INVALID_XFORM_TAG_REGEXP, self.name)
             raise PyXFormError(
                 f"The name '{self.name}' contains an invalid character '{invalid_char.group(0)}'. Names {const.XML_IDENTIFIER_ERROR_MESSAGE}"

diff --git a/pyxform/validators/pyxform/choices.py b/pyxform/validators/pyxform/choices.py
@@ -0,0 +1,59 @@
+from pyxform import constants
+from pyxform.errors import PyXFormError
+
+INVALID_NAME = (
+    "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
+    "Choices must have a name. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_LABEL = (
+    "[row : {row}] On the 'choices' sheet, the 'label' value is invalid. "
+    "Choices should have a label. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_HEADER = (
+    "[row : 1] On the 'choices' sheet, the '{column}' value is invalid. "
+    "Column headers must not be empty and must not contain spaces. "
+    "Learn more: https://xlsform.org/en/#setting-up-your-worksheets"
+)
+INVALID_DUPLICATE = (
+    "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. "
+    "Choice names must be unique for each choice list. "
+    "If this is intentional, use the setting 'allow_choice_duplicates'. "
+    "Learn more: https://xlsform.org/#choice-names."
+)
+
+
+def validate_headers(
+    headers: tuple[tuple[str, ...], ...], warnings: list[str]
+) -> list[str]:
+    def check():
+        for header in headers:
+            header = header[0]
+            if header != constants.LIST_NAME_S and (" " in header or header == ""):
+                warnings.append(INVALID_HEADER.format(column=header))
+                yield header
+
+    return list(check())
+
+
+def validate_choices(
+    options: list[dict], warnings: list[str], allow_duplicates: bool = False
+) -> None:
+    seen_options = set()
+    duplicate_errors = []
+    for option in options:
+        if "name" not in option:
+            raise PyXFormError(INVALID_NAME.format(row=option["__row"]))
+        elif "label" not in option:
+            warnings.append(INVALID_LABEL.format(row=option["__row"]))
+
+        if not allow_duplicates:
+            name = option["name"]
+            if name in seen_options:
+                duplicate_errors.append(INVALID_DUPLICATE.format(row=option["__row"]))
+            else:
+                seen_options.add(name)
+
+    if 0 < len(duplicate_errors):
+        raise PyXFormError("\n".join(duplicate_errors))
diff --git a/pyxform/validators/pyxform/question_types.py b/pyxform/validators/pyxform/question_types.py
@@ -3,7 +3,7 @@
 """
 
 from pyxform.errors import PyXFormError
-from pyxform.parsing.expression import is_single_token_expression
+from pyxform.parsing.expression import is_pyxform_reference
 from pyxform.utils import PYXFORM_REFERENCE_REGEX
 
 BACKGROUND_GEOPOINT_CALCULATION = "[row : {r}] For 'background-geopoint' questions, the 'calculation' column must be empty."
@@ -25,9 +25,7 @@ def validate_background_geopoint_calculation(row: dict, row_num: int) -> bool:
 
 def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool:
     """A background-geopoint must have a trigger."""
-    if not row.get("trigger", False) or not is_single_token_expression(
-        expression=row["trigger"], token_types=["PYXFORM_REF"]
-    ):
+    if not row.get("trigger", False) or not is_pyxform_reference(value=row["trigger"]):
         raise PyXFormError(TRIGGER_INVALID.format(r=row_num, t=row["type"]))
     return True
 

diff --git a/pyxform/xlsparseutils.py → .../validators/pyxform/sheet_misspellings.py b/pyxform/xlsparseutils.py → .../validators/pyxform/sheet_misspellings.py
@@ -1,14 +1,8 @@
-import re
 from collections.abc import KeysView
 
 from pyxform import constants
 from pyxform.utils import levenshtein_distance
 
-# http://www.w3.org/TR/REC-xml/
-TAG_START_CHAR = r"[a-zA-Z:_]"
-TAG_CHAR = r"[a-zA-Z:_0-9\-.]"
-XFORM_TAG_REGEXP = re.compile(rf"^{TAG_START_CHAR}{TAG_CHAR}*$")
-
 
 def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
     """
@@ -36,10 +30,3 @@ def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None":
         return msg
     else:
         return None
-
-
-def is_valid_xml_tag(tag):
-    """
-    Use a regex to see if there are any invalid characters (i.e. spaces).
-    """
-    return re.search(XFORM_TAG_REGEXP, tag)
diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py
@@ -6,7 +6,6 @@
 import os
 import re
 import sys
-from collections import Counter
 from typing import IO, Any
 
 from pyxform import aliases, constants
@@ -21,15 +20,16 @@
     validate_entity_saveto,
 )
 from pyxform.errors import PyXFormError
-from pyxform.parsing.expression import is_single_token_expression
+from pyxform.parsing.expression import is_pyxform_reference, is_xml_tag
 from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic
+from pyxform.validators.pyxform import choices as vc
 from pyxform.validators.pyxform import parameters_generic, select_from_file
 from pyxform.validators.pyxform import question_types as qt
 from pyxform.validators.pyxform.android_package_name import validate_android_package_name
 from pyxform.validators.pyxform.pyxform_reference import validate_pyxform_reference_syntax
+from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings
 from pyxform.validators.pyxform.translations_checks import SheetTranslations
 from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict
-from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag
 
 SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'}
 RE_SMART_QUOTES = re.compile(r"|".join(re.escape(old) for old in SMART_QUOTES))
@@ -175,7 +175,12 @@ def dealias_types(dict_array):
     return dict_array
 
 
-def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool = False):
+def clean_text_values(
+    sheet_name: str,
+    data: list[dict],
+    strip_whitespace: bool = False,
+    add_row_number: bool = False,
+) -> list[dict]:
     """
     Go though the dict array and strips all text values.
     Also replaces multiple spaces with single spaces.
@@ -192,6 +197,8 @@ def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool
                 validate_pyxform_reference_syntax(
                     value=value, sheet_name=sheet_name, row_number=row_number, key=key
                 )
+        if add_row_number:
+            row["__row"] = row_number
     return data
 
 
@@ -513,7 +520,11 @@ def workbook_to_json(
 
     # ########## Choices sheet ##########
     choices_sheet = workbook_dict.get(constants.CHOICES, [])
-    choices_sheet = clean_text_values(sheet_name=constants.CHOICES, data=choices_sheet)
+    choices_sheet = clean_text_values(
+        sheet_name=constants.CHOICES,
+        data=choices_sheet,
+        add_row_number=True,
+    )
     choices_sheet = dealias_and_group_headers(
         dict_array=choices_sheet,
         header_aliases=aliases.list_header,
@@ -523,73 +534,27 @@ def workbook_to_json(
     choices = group_dictionaries_by_key(
         list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S
     )
-    if 0 < len(choices):
-        json_dict[constants.CHOICES] = choices
     # To combine the warning into one message, the check for missing choices translation
     # columns is run with Survey sheet below.
 
-    # Make sure all the options have the required properties:
-    warnedabout = set()
-    for list_name, options in choices.items():
+    # Warn and remove invalid headers in case the form uses headers for notes.
+    invalid_headers = vc.validate_headers(choices_sheet.headers, warnings)
+    allow_duplicates = aliases.yes_no.get(
+        settings.get("allow_choice_duplicates", False), False
+    )
+    for options in choices.values():
+        vc.validate_choices(
+            options=options,
+            warnings=warnings,
+            allow_duplicates=allow_duplicates,
+        )
         for option in options:
-            if "name" not in option:
-                info = "[list_name : " + list_name + "]"
-                raise PyXFormError(
-                    "On the choices sheet there is a option with no name. " + info
-                )
-            if "label" not in option:
-                info = "[list_name : " + list_name + "]"
-                warnings.append(
-                    "On the choices sheet there is a option with no label. " + info
-                )
-            # chrislrobert's fix for a cryptic error message:
-            # see: https://code.google.com/p/opendatakit/issues/detail?id=833&start=200
-            option_keys = list(option.keys())
-            for headername in option_keys:
-                # Using warnings and removing the bad columns
-                # instead of throwing errors because some forms
-                # use choices column headers for notes.
-                if " " in headername:
-                    if headername not in warnedabout:
-                        warnedabout.add(headername)
-                        warnings.append(
-                            "On the choices sheet there is "
-                            + 'a column ("'
-                            + headername
-                            + '") with an illegal header. '
-                            + "Headers cannot include spaces."
-                        )
-                    del option[headername]
-                elif headername == "":
-                    warnings.append(
-                        "On the choices sheet there is a value"
-                        + " in a column with no header."
-                    )
-                    del option[headername]
-        list_name_choices = [option.get("name") for option in options]
-        if len(list_name_choices) != len(set(list_name_choices)):
-            duplicate_setting = settings.get("allow_choice_duplicates")
-            for v in Counter(list_name_choices).values():
-                if v > 1:
-                    if not duplicate_setting or duplicate_setting.capitalize() != "Yes":
-                        choice_duplicates = [
-                            item
-                            for item, count in Counter(list_name_choices).items()
-                            if count > 1
-                        ]
-
-                        if choice_duplicates:
-                            raise PyXFormError(
-                                "The name column for the '{}' choice list contains these duplicates: {}. Duplicate names "
-                                "will be impossible to identify in analysis unless a previous value in a cascading "
-                                "select differentiates them. If this is intentional, you can set the "
-                                "allow_choice_duplicates setting to 'yes'. Learn more: https://xlsform.org/#choice-names.".format(
-                                    list_name,
-                                    ", ".join(
-                                        [f"'{dupe}'" for dupe in choice_duplicates]
-                                    ),
-                                )
-                            )
+            for invalid_header in invalid_headers:
+                option.pop(invalid_header, None)
+            del option["__row"]
+
+    if 0 < len(choices):
+        json_dict[constants.CHOICES] = choices
 
     # ########## Entities sheet ###########
     entities_sheet = workbook_dict.get(constants.ENTITIES, [])
@@ -945,7 +910,7 @@ def workbook_to_json(
                     ROW_FORMAT_STRING % row_number + " Question or group with no name."
                 )
         question_name = str(row[constants.NAME])
-        if not is_valid_xml_tag(question_name):
+        if not is_xml_tag(question_name):
             if isinstance(question_name, bytes):
                 question_name = question_name.decode("utf-8")
 
@@ -1022,10 +987,7 @@ def workbook_to_json(
                 repeat_count_expression = new_json_dict.get("control", {}).get("jr:count")
                 if repeat_count_expression:
                     # Simple expressions don't require a new node, they can reference directly.
-                    simple_expression = is_single_token_expression(
-                        expression=repeat_count_expression, token_types=["PYXFORM_REF"]
-                    )
-                    if not simple_expression:
+                    if not is_pyxform_reference(value=repeat_count_expression):
                         generated_node_name = new_json_dict["name"] + "_count"
                         parent_children_array.append(
                             {

diff --git a/tests/parsing/__init__.py b/tests/parsing/__init__.py