Allow the "lang_AREA" form when normalizing values for the language facet.
whiskyechobravo · Oct 5, 2024 · a8cafc9 · a8cafc9
1 parent ac432b1
commit a8cafc9
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,18 @@
 Before doing an upgrade, please check the "How to upgrade" section of the Kerko
 documentation.
 
+## Unreleased
+
+New features:
+
+- Allow the "lang_AREA" form (e.g., "en_US", "fr_FR") when normalizing language
+  values for the "Resource language" facet.
+
+Other changes:
+
+- Improve documentation.
+
+
 ## 1.2.0 (2024-08-03)
 
 New features:

diff --git a/src/kerko/extractors.py b/src/kerko/extractors.py
@@ -462,24 +462,7 @@ def extract(self, item, library_context, spec):  # noqa: ARG002
         """
         Extract item language(s) into (value, label) tuples.
 
-        When normalizing, this looks into the language database to replace the
-        item's language value by its corresponding ISO 639-3 code and language
-        name. If the value has the form "lang-AREA", "AREA" is ignored when
-        searching a corresponding language. Matching is case-insensitive and
-        proceeds in the following order, stopping at the first match found:
-
-        1. Search a matching 3-letter code from ISO 639-3.
-        2. Search a matching 3-letter bibliographic (B) code from ISO 639-2.
-        3. Search a matching 2-letter code from ISO 639-1.
-        4. Search a matching English language name.
-
-        When a matching language is found, the encoded value uses the 3-letter
-        ISO 639-3 code, and the encoded label is the language name, translated
-        in the locale specified at initialization time.
-
-        When no matching language is found, the value is encoded in its
-        lowercase form, and the label is normalized using the `normalize_case`
-        callable specified at initialization time.
+        Multiple values are separated using the `self.values_separator` regex.
         """
         values = self.values_separator.split(item.get("data", {}).get("language", ""))
         if self.normalize:
@@ -490,8 +473,30 @@ def extract(self, item, library_context, spec):  # noqa: ARG002
         return [value for value in dict.fromkeys(values).keys() if value] or None
 
     def normalize_language(self, value):
+        """
+        Given a str value, return a corresponding (language code, name) tuple.
+
+        This searches the language database and tries to find an ISO 639-3 code
+        corresponding to the given value. If the value has the form "lang-AREA"
+        or "lang_AREA", "AREA" is ignored when searching for a language code.
+        Matching is case-insensitive and proceeds in the following order,
+        stopping at the first match found:
+
+        1. Search for a 3-letter ISO 639-3 code.
+        2. Search for a 3-letter ISO 639-2 bibliographic (B) code.
+        3. Search for a 2-letter ISO 639-1 code.
+        4. Search for an English language name.
+
+        If a matching language is found, a tuple is returned with the 3-letter
+        ISO 639-3 code, and the language name. The language name is translated
+        in the locale specified by `self.locale`.
+
+        If no matching language is found, a tuple is returned with the value
+        converted to lowercase form, and a label. The label is normalized using
+        the `self.normalize_case` callable.
+        """
         value = value.strip()
-        lang = value.split("-", maxsplit=1)[0]
+        lang = re.split(r"[-_]", value, maxsplit=1)[0]
         match = None
         if len(lang) == 3:  # noqa: PLR2004
             match = pycountry.languages.get(alpha_3=lang)

diff --git a/tests/test_extractors.py b/tests/test_extractors.py
@@ -46,6 +46,8 @@ def test_iso639_eng_alpha2(self):
     def test_iso639_eng_alpha2_area(self):
         result = self.do_extract_test("en-US")
         self.assertListEqual(result, [("eng", "English")])
+        result = self.do_extract_test("en_US")
+        self.assertListEqual(result, [("eng", "English")])
 
     def test_iso639_eng_alpha3(self):
         result = self.do_extract_test("eng")
@@ -106,6 +108,8 @@ def test_unknown_normalize_invalid(self):
     def test_unknown_name_area(self):
         result = self.do_extract_test("xx-XX")
         self.assertListEqual(result, [("xx-xx", "Xx-Xx")])
+        result = self.do_extract_test("xx_XX")
+        self.assertListEqual(result, [("xx_xx", "Xx_Xx")])
 
     def test_unknown_name_invalid(self):
         result = self.do_extract_test("Newspeak", allow_invalid=False)