Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Doc/library/unicodedata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,32 @@ following functions:
1


.. function:: isidstart(chr, /)

Return ``True`` if the character has the ``XID_Start`` property, ``False``
otherwise. For example::

>>> unicodedata.isidstart('S')
True
>>> unicodedata.isidstart('0')
False

.. versionadded:: next


.. function:: isidcontinue(chr, /)

Return ``True`` if the character has the ``XID_Continue`` property, ``False``
otherwise. For example::

>>> unicodedata.isidcontinue('S')
True
>>> unicodedata.isidcontinue(' ')
False

.. versionadded:: next


.. function:: decomposition(chr)

Returns the character decomposition mapping assigned to the character
Expand Down
4 changes: 4 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,10 @@ unicodedata

* The Unicode database has been updated to Unicode 17.0.0.

* Add :func:`unicodedata.isidstart` and :func:`unicodedata.isidcontinue`
functions.
(Contributed by Stan Ulbrych in :gh:`129117`.)


wave
----
Expand Down
8 changes: 8 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,14 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Py_UCS4 ch /* Unicode character */
);

// Test whether `ch` has the Unicode XID_Start property.  The result is used
// as a truth value (unicodedata's isidstart() converts it to a bool).
PyAPI_FUNC(int) _PyUnicode_IsXidStart(
Py_UCS4 ch /* Unicode character */
);

// Test whether `ch` has the Unicode XID_Continue property.  The result is
// used as a truth value (unicodedata's isidcontinue() converts it to a bool).
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
Py_UCS4 ch /* Unicode character */
);

// Helper array used by Py_UNICODE_ISSPACE().
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];

Expand Down
4 changes: 2 additions & 2 deletions Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)

/* --- Characters Type APIs ----------------------------------------------- */

extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
Expand Down
27 changes: 27 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self):
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')

def test_isidstart(self):
self.assertTrue(self.db.isidstart('S'))
self.assertTrue(self.db.isidstart('\u0AD0')) # GUJARATI OM
self.assertTrue(self.db.isidstart('\u0EC6')) # LAO KO LA
self.assertTrue(self.db.isidstart('\u17DC')) # KHMER SIGN AVAKRAHASANYA
self.assertTrue(self.db.isidstart('\uA015')) # YI SYLLABLE WU
self.assertTrue(self.db.isidstart('\uFE7B')) # ARABIC KASRA MEDIAL FORM

self.assertFalse(self.db.isidstart(' '))
self.assertFalse(self.db.isidstart('0'))
self.assertRaises(TypeError, self.db.isidstart)
self.assertRaises(TypeError, self.db.isidstart, 'xx')

def test_isidcontinue(self):
self.assertTrue(self.db.isidcontinue('S'))
self.assertTrue(self.db.isidcontinue('_'))
self.assertTrue(self.db.isidcontinue('0'))
self.assertTrue(self.db.isidcontinue('\u00BA')) # MASCULINE ORDINAL INDICATOR
self.assertTrue(self.db.isidcontinue('\u0640')) # ARABIC TATWEEL
self.assertTrue(self.db.isidcontinue('\u0710')) # SYRIAC LETTER ALAPH
self.assertTrue(self.db.isidcontinue('\u0B3E')) # ORIYA VOWEL SIGN AA
self.assertTrue(self.db.isidcontinue('\u17D7')) # KHMER SIGN LEK TOO

self.assertFalse(self.db.isidcontinue(' '))
self.assertRaises(TypeError, self.db.isidcontinue)
self.assertRaises(TypeError, self.db.isidcontinue, 'xx')

class UnicodeMiscTest(UnicodeDatabaseTest):

@cpython_only
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:mod:`unicodedata`: Add :func:`~unicodedata.isidstart` and
:func:`~unicodedata.isidcontinue` functions.
74 changes: 73 additions & 1 deletion Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 58 additions & 0 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1525,6 +1525,62 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
return PyUnicode_FromString(name);
}

/*[clinic input]
unicodedata.UCD.isidstart

self: self
chr: int(accept={str})
/

Return True if the character has the XID_Start property, else False.

[clinic start generated code]*/

static PyObject *
unicodedata_UCD_isidstart_impl(PyObject *self, int chr)
/*[clinic end generated code: output=29fbeaf6491d9f85 input=b71b6b1b2db3c16d]*/
{
    /* Implementation of UCD.isidstart(): report whether the character with
       code point `chr` has the XID_Start property.  Returns a Python bool. */
    const Py_UCS4 code = (Py_UCS4)chr;

    /* When `self` passes UCD_Check() (presumably an older-database UCD
       object rather than the module -- confirm against UCD_Check), look at
       its change record first: category_changed == 0 marks the code point
       as unassigned there, so it cannot have the property. */
    if (UCD_Check(self) && get_old_record(self, code)->category_changed == 0) {
        Py_RETURN_FALSE;
    }

    /* Otherwise defer to the current database's XID_Start predicate. */
    return PyBool_FromLong(_PyUnicode_IsXidStart(code));
}

/*[clinic input]
unicodedata.UCD.isidcontinue

self: self
chr: int(accept={str})
/

Return True if the character has the XID_Continue property, else False.

[clinic start generated code]*/

static PyObject *
unicodedata_UCD_isidcontinue_impl(PyObject *self, int chr)
/*[clinic end generated code: output=5ae694da0ee16534 input=01b4ccd399484e6b]*/
{
    /* Implementation of UCD.isidcontinue(): report whether the character
       with code point `chr` has the XID_Continue property.  Returns a
       Python bool. */
    const Py_UCS4 code = (Py_UCS4)chr;

    /* When `self` passes UCD_Check() (presumably an older-database UCD
       object rather than the module -- confirm against UCD_Check), look at
       its change record first: category_changed == 0 marks the code point
       as unassigned there, so it cannot have the property. */
    if (UCD_Check(self) && get_old_record(self, code)->category_changed == 0) {
        Py_RETURN_FALSE;
    }

    /* Otherwise defer to the current database's XID_Continue predicate. */
    return PyBool_FromLong(_PyUnicode_IsXidContinue(code));
}

/*[clinic input]
unicodedata.UCD.lookup

Expand Down Expand Up @@ -1590,6 +1646,8 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_ISIDSTART_METHODDEF
UNICODEDATA_UCD_ISIDCONTINUE_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
Expand Down
Loading