Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions Doc/library/unicodedata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,36 @@ following functions:
1


.. function:: isxidstart(chr, /)

Return ``True`` if *chr* is a valid identifier start per the
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
that is, it has the ``XID_Start`` property. Return ``False`` otherwise.
For example::

>>> unicodedata.isxidstart('S')
True
>>> unicodedata.isxidstart('0')
False

.. versionadded:: next


.. function:: isxidcontinue(chr, /)

Return ``True`` if *chr* is a valid identifier continuation character per the
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_,
that is, it has the ``XID_Continue`` property. Return ``False`` otherwise.
For example::

>>> unicodedata.isxidcontinue('S')
True
>>> unicodedata.isxidcontinue(' ')
False

.. versionadded:: next


.. function:: decomposition(chr)

Returns the character decomposition mapping assigned to the character
Expand Down
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,11 @@ unicodedata

* The Unicode database has been updated to Unicode 17.0.0.

* Add :func:`unicodedata.isxidstart` and :func:`unicodedata.isxidcontinue`
functions to check whether a character can start or continue a
`Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
(Contributed by Stan Ulbrych in :gh:`129117`.)


wave
----
Expand Down
25 changes: 25 additions & 0 deletions Include/internal/pycore_unicodectype.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef Py_INTERNAL_UNICODECTYPE_H
#define Py_INTERNAL_UNICODECTYPE_H
#ifdef __cplusplus
extern "C" {
#endif

/* Internal declarations of the Unicode character type APIs. */

// Internal header: refuse to compile outside of a core build.
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif

// Full case conversions of a single code point; the result is written
// through `res`.
// NOTE(review): the int return is presumably the number of code points
// stored in `res` -- confirm against the definitions.
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
// Per-code-point predicates used for casing.
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
extern int _PyUnicode_IsCased(Py_UCS4 ch);

// Export for 'unicodedata' shared extension.
// Unlike the plain `extern` declarations above, these two use PyAPI_FUNC so
// that code built outside the core library can link against them.
PyAPI_FUNC(int) _PyUnicode_IsXidStart(Py_UCS4 ch);
PyAPI_FUNC(int) _PyUnicode_IsXidContinue(Py_UCS4 ch);

#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_UNICODECTYPE_H */
12 changes: 0 additions & 12 deletions Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,6 @@ _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
return 0;
}


/* --- Characters Type APIs ----------------------------------------------- */

extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
extern int _PyUnicode_IsCased(Py_UCS4 ch);

/* --- Unicode API -------------------------------------------------------- */

// Export for '_json' shared extension
Expand Down
27 changes: 27 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,33 @@ def test_east_asian_width_9_0_changes(self):
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')

def test_isxidstart(self):
    # Exercise isxidstart() with characters that do and do not have the
    # XID_Start property, then with invalid call shapes.
    is_start = self.db.isxidstart

    starters = (
        'S',
        '\u0AD0',  # GUJARATI OM
        '\u0EC6',  # LAO KO LA
        '\u17DC',  # KHMER SIGN AVAKRAHASANYA
        '\uA015',  # YI SYLLABLE WU
        '\uFE7B',  # ARABIC KASRA MEDIAL FORM
    )
    for ch in starters:
        self.assertTrue(is_start(ch))

    # A space and a decimal digit cannot start an identifier.
    for ch in (' ', '0'):
        self.assertFalse(is_start(ch))

    # A missing argument and a multi-character string are both rejected.
    with self.assertRaises(TypeError):
        is_start()
    with self.assertRaises(TypeError):
        is_start('xx')

def test_isxidcontinue(self):
    # Exercise isxidcontinue() with characters that do and do not have the
    # XID_Continue property, then with invalid call shapes.
    is_continue = self.db.isxidcontinue

    continuers = (
        'S',
        '_',
        '0',
        '\u00BA',  # MASCULINE ORDINAL INDICATOR
        '\u0640',  # ARABIC TATWEEL
        '\u0710',  # SYRIAC LETTER ALAPH
        '\u0B3E',  # ORIYA VOWEL SIGN AA
        '\u17D7',  # KHMER SIGN LEK TOO
    )
    for ch in continuers:
        self.assertTrue(is_continue(ch))

    # A space is not allowed inside an identifier.
    self.assertFalse(is_continue(' '))

    # A missing argument and a multi-character string are both rejected.
    with self.assertRaises(TypeError):
        is_continue()
    with self.assertRaises(TypeError):
        is_continue('xx')

class UnicodeMiscTest(UnicodeDatabaseTest):

@cpython_only
Expand Down
1 change: 1 addition & 0 deletions Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1433,6 +1433,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_typeobject.h \
$(srcdir)/Include/internal/pycore_typevarobject.h \
$(srcdir)/Include/internal/pycore_ucnhash.h \
$(srcdir)/Include/internal/pycore_unicodectype.h \
$(srcdir)/Include/internal/pycore_unicodeobject.h \
$(srcdir)/Include/internal/pycore_unicodeobject_generated.h \
$(srcdir)/Include/internal/pycore_unionobject.h \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:mod:`unicodedata`: Add :func:`~unicodedata.isxidstart` and
:func:`~unicodedata.isxidcontinue` functions to check whether a character can
start or continue a `Unicode Standard Annex #31 <https://www.unicode.org/reports/tr31/>`_ identifier.
74 changes: 73 additions & 1 deletion Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "Python.h"
#include "pycore_object.h" // _PyObject_VisitType()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart()

#include <stdbool.h>
#include <stddef.h> // offsetof()
Expand Down Expand Up @@ -1525,6 +1526,58 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
return PyUnicode_FromString(name);
}

/*[clinic input]
unicodedata.UCD.isxidstart

self: self
chr: int(accept={str})
/

Return True if the character has the XID_Start property, else False.

[clinic start generated code]*/

static PyObject *
unicodedata_UCD_isxidstart_impl(PyObject *self, int chr)
/*[clinic end generated code: output=944005823c72c3ef input=9353f88d709c21fb]*/
{
    /* When called on a UCD object, consult its change record first: a
       category_changed value of 0 marks the code point as unassigned in
       that database, in which case the answer is always False. */
    if (UCD_Check(self)
        && get_old_record(self, chr)->category_changed == 0)
    {
        Py_RETURN_FALSE;
    }

    /* Otherwise defer to the core XID_Start lookup. */
    int has_xid_start = _PyUnicode_IsXidStart(chr);
    return PyBool_FromLong(has_xid_start);
}

/*[clinic input]
unicodedata.UCD.isxidcontinue

self: self
chr: int(accept={str})
/

Return True if the character has the XID_Continue property, else False.

[clinic start generated code]*/

static PyObject *
unicodedata_UCD_isxidcontinue_impl(PyObject *self, int chr)
/*[clinic end generated code: output=9438dcbff5ca3e41 input=bbb8dd3ac0d2d709]*/
{
    /* When called on a UCD object, consult its change record first: a
       category_changed value of 0 marks the code point as unassigned in
       that database, in which case the answer is always False. */
    if (UCD_Check(self)
        && get_old_record(self, chr)->category_changed == 0)
    {
        Py_RETURN_FALSE;
    }

    /* Otherwise defer to the core XID_Continue lookup. */
    int has_xid_continue = _PyUnicode_IsXidContinue(chr);
    return PyBool_FromLong(has_xid_continue);
}

/*[clinic input]
unicodedata.UCD.lookup

Expand Down Expand Up @@ -1590,6 +1643,8 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_ISXIDSTART_METHODDEF
UNICODEDATA_UCD_ISXIDCONTINUE_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
Expand Down
1 change: 1 addition & 0 deletions Objects/unicodectype.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
*/

#include "Python.h"
#include "pycore_unicodectype.h" // export _PyUnicode_IsXidStart(), _PyUnicode_IsXidContinue()

#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
Expand Down
1 change: 1 addition & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
#include "pycore_unicodectype.h" // _PyUnicode_IsXidStart
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
#include "pycore_unicodeobject_generated.h" // _PyUnicode_InitStaticStrings()

Expand Down
1 change: 1 addition & 0 deletions PCbuild/pythoncore.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@
<ClInclude Include="..\Include\internal\pycore_typevarobject.h" />
<ClInclude Include="..\Include\internal\pycore_ucnhash.h" />
<ClInclude Include="..\Include\internal\pycore_unionobject.h" />
<ClInclude Include="..\Include\internal\pycore_unicodectype.h" />
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h" />
<ClInclude Include="..\Include\internal\pycore_unicodeobject_generated.h" />
<ClInclude Include="..\Include\internal\pycore_uniqueid.h" />
Expand Down
3 changes: 3 additions & 0 deletions PCbuild/pythoncore.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,9 @@
<ClInclude Include="..\Include\cpython\initconfig.h">
<Filter>Include\cpython</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_unicodectype.h">
<Filter>Include\internal</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_unicodeobject.h">
<Filter>Include\internal</Filter>
</ClInclude>
Expand Down
Loading