From 64e40d89ddb54b192d84132e6540fcc6553a8050 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 21 Feb 2025 11:37:21 -0600 Subject: [PATCH 1/6] Speed up creating Mark objects in the Cython version --- yaml/_yaml.pyx | 60 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index e3e93e2ce..f0a7545c1 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -68,6 +68,10 @@ cdef class Mark: def __init__(self, object name, size_t index, size_t line, size_t column, object buffer, object pointer): + self._fast_init(name, index, line, column, buffer, pointer) + + cdef _fast_init(self, object name, size_t index, size_t line, size_t column, + object buffer, object pointer): self.name = name self.index = index self.line = line @@ -300,6 +304,8 @@ cdef class CParser: pass cdef object _parser_error(self): + cdef Mark context_mark + cdef Mark problem_mark if self.parser.error == YAML_MEMORY_ERROR: return MemoryError elif self.parser.error == YAML_READER_ERROR: @@ -307,15 +313,15 @@ cdef class CParser: self.parser.problem_value, u'?', PyUnicode_FromString(self.parser.problem)) elif self.parser.error == YAML_SCANNER_ERROR \ or self.parser.error == YAML_PARSER_ERROR: - context_mark = None - problem_mark = None if self.parser.context != NULL: - context_mark = Mark(self.stream_name, + context_mark = Mark.__new__(Mark) + context_mark._fast_init(self.stream_name, self.parser.context_mark.index, self.parser.context_mark.line, self.parser.context_mark.column, None, None) if self.parser.problem != NULL: - problem_mark = Mark(self.stream_name, + problem_mark = Mark.__new__(Mark) + problem_mark._fast_init(self.stream_name, self.parser.problem_mark.index, self.parser.problem_mark.line, self.parser.problem_mark.column, None, None) @@ -356,12 +362,14 @@ cdef class CParser: return token_object cdef object _token_to_object(self, yaml_token_t *token): - start_mark = Mark(self.stream_name, + cdef Mark start_mark = Mark.__new__(Mark) + cdef Mark end_mark = Mark.__new__(Mark) + start_mark._fast_init(self.stream_name, token.start_mark.index, token.start_mark.line, token.start_mark.column, None, None) - end_mark = Mark(self.stream_name, + end_mark._fast_init(self.stream_name, token.end_mark.index, token.end_mark.line, token.end_mark.column, @@ -503,12 +511,14 @@ cdef class CParser: cdef object _event_to_object(self, yaml_event_t *event): cdef yaml_tag_directive_t *tag_directive - start_mark = Mark(self.stream_name, + cdef Mark start_mark = Mark.__new__(Mark) + cdef Mark end_mark = Mark.__new__(Mark) + start_mark._fast_init(self.stream_name, event.start_mark.index, event.start_mark.line, event.start_mark.column, None, None) - end_mark = Mark(self.stream_name, + end_mark._fast_init(self.stream_name, event.end_mark.index, event.end_mark.line, event.end_mark.column, @@ -665,6 +675,7 @@ cdef class CParser: return self._compose_document() def get_single_node(self): + cdef Mark mark self._parse_next_event() yaml_event_delete(&self.parsed_event) self._parse_next_event() @@ -673,7 +684,8 @@ cdef class CParser: document = self._compose_document() self._parse_next_event() if self.parsed_event.type != YAML_STREAM_END_EVENT: - mark = Mark(self.stream_name, + mark = Mark.__new__(Mark) + mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -691,11 +703,13 @@ cdef class CParser: return node cdef object _compose_node(self, object parent, object index): + cdef Mark mark self._parse_next_event() if self.parsed_event.type == YAML_ALIAS_EVENT: anchor = PyUnicode_FromYamlString(self.parsed_event.data.alias.anchor) if anchor not in self.anchors: - mark = Mark(self.stream_name, + mark = Mark.__new__(Mark) + mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -715,7 +729,8 @@ cdef class CParser: anchor = PyUnicode_FromYamlString(self.parsed_event.data.mapping_start.anchor) if anchor is not None: if anchor in self.anchors: - mark = Mark(self.stream_name, + mark = Mark.__new__(Mark) + mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -733,12 +748,14 @@ cdef class CParser: return node cdef _compose_scalar_node(self, object anchor): - start_mark = Mark(self.stream_name, + cdef Mark start_mark = Mark.__new__(Mark) + cdef Mark end_mark = Mark.__new__(Mark) + start_mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, None, None) - end_mark = Mark(self.stream_name, + end_mark._fast_init(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, @@ -776,7 +793,9 @@ cdef class CParser: cdef _compose_sequence_node(self, object anchor): cdef int index - start_mark = Mark(self.stream_name, + cdef Mark start_mark = Mark.__new__(Mark) + cdef Mark end_mark = Mark.__new__(Mark) + start_mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -806,16 +825,20 @@ cdef class CParser: value.append(self._compose_node(node, index)) index = index+1 self._parse_next_event() - node.end_mark = Mark(self.stream_name, + + end_mark._fast_init(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, None, None) + node.end_mark = end_mark yaml_event_delete(&self.parsed_event) return node cdef _compose_mapping_node(self, object anchor): - start_mark = Mark(self.stream_name, + cdef Mark start_mark = Mark.__new__(Mark) + cdef Mark end_mark = Mark.__new__(Mark) + start_mark._fast_init(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -845,11 +868,12 @@ cdef class CParser: item_value = self._compose_node(node, item_key) value.append((item_key, item_value)) self._parse_next_event() - node.end_mark = Mark(self.stream_name, + end_mark._fast_init(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, None, None) + node.end_mark = end_mark yaml_event_delete(&self.parsed_event) return node @@ -912,7 +936,7 @@ cdef class CEmitter: if hasattr(stream, u'encoding'): self.dump_unicode = 1 self.use_encoding = encoding - yaml_emitter_set_output(&self.emitter, output_handler, self) + yaml_emitter_set_output(&self.emitter, output_handler, self) if canonical: yaml_emitter_set_canonical(&self.emitter, 1) if indent is not None: From 964c641aad18b5df3ef92b9202a7b82283939d9e Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 16 May 2025 14:19:47 -0400 Subject: [PATCH 2/6] Add `_create_mark` helper --- yaml/_yaml.pyx | 53 ++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index f0a7545c1..86f138118 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -87,6 +87,13 @@ cdef class Mark: % (self.name, self.line+1, self.column+1) return where +# Helper function to create a Mark object with a single call +cdef Mark _create_mark(object name, size_t index, size_t line, size_t column, + object buffer, object pointer): + cdef Mark mark = Mark.__new__(Mark) + mark._fast_init(name, index, line, column, buffer, pointer) + return mark + #class YAMLError(Exception): # pass # @@ -314,14 +321,12 @@ cdef class CParser: elif self.parser.error == YAML_SCANNER_ERROR \ or self.parser.error == YAML_PARSER_ERROR: if self.parser.context != NULL: - context_mark = Mark.__new__(Mark) - context_mark._fast_init(self.stream_name, + context_mark = _create_mark(self.stream_name, self.parser.context_mark.index, self.parser.context_mark.line, self.parser.context_mark.column, None, None) if self.parser.problem != NULL: - problem_mark = Mark.__new__(Mark) - problem_mark._fast_init(self.stream_name, + problem_mark = _create_mark(self.stream_name, self.parser.problem_mark.index, self.parser.problem_mark.line, self.parser.problem_mark.column, None, None) @@ -362,14 +367,12 @@ cdef class CParser: return token_object cdef object _token_to_object(self, yaml_token_t *token): - cdef Mark start_mark = Mark.__new__(Mark) - cdef Mark end_mark = Mark.__new__(Mark) - start_mark._fast_init(self.stream_name, + cdef Mark start_mark = _create_mark(self.stream_name, token.start_mark.index, token.start_mark.line, token.start_mark.column, None, None) - end_mark._fast_init(self.stream_name, + cdef Mark end_mark = _create_mark(self.stream_name, token.end_mark.index, token.end_mark.line, token.end_mark.column, @@ -511,14 +514,12 @@ cdef class CParser: cdef object _event_to_object(self, yaml_event_t *event): cdef yaml_tag_directive_t *tag_directive - cdef Mark start_mark = Mark.__new__(Mark) - cdef Mark end_mark = Mark.__new__(Mark) - start_mark._fast_init(self.stream_name, + cdef Mark start_mark = _create_mark(self.stream_name, event.start_mark.index, event.start_mark.line, event.start_mark.column, None, None) - end_mark._fast_init(self.stream_name, + cdef Mark end_mark = _create_mark(self.stream_name, event.end_mark.index, event.end_mark.line, event.end_mark.column, @@ -684,8 +685,7 @@ cdef class CParser: document = self._compose_document() self._parse_next_event() if self.parsed_event.type != YAML_STREAM_END_EVENT: - mark = Mark.__new__(Mark) - mark._fast_init(self.stream_name, + mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -708,8 +708,7 @@ cdef class CParser: if self.parsed_event.type == YAML_ALIAS_EVENT: anchor = PyUnicode_FromYamlString(self.parsed_event.data.alias.anchor) if anchor not in self.anchors: - mark = Mark.__new__(Mark) - mark._fast_init(self.stream_name, + mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -729,8 +728,7 @@ cdef class CParser: anchor = PyUnicode_FromYamlString(self.parsed_event.data.mapping_start.anchor) if anchor is not None: if anchor in self.anchors: - mark = Mark.__new__(Mark) - mark._fast_init(self.stream_name, + mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -748,14 +746,12 @@ cdef class CParser: return node cdef _compose_scalar_node(self, object anchor): - cdef Mark start_mark = Mark.__new__(Mark) - cdef Mark end_mark = Mark.__new__(Mark) - start_mark._fast_init(self.stream_name, + cdef Mark start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, None, None) - end_mark._fast_init(self.stream_name, + cdef Mark end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, @@ -793,13 +789,12 @@ cdef class CParser: cdef _compose_sequence_node(self, object anchor): cdef int index - cdef Mark start_mark = Mark.__new__(Mark) - cdef Mark end_mark = Mark.__new__(Mark) - start_mark._fast_init(self.stream_name, + cdef Mark start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, None, None) + cdef Mark end_mark implicit = False if self.parsed_event.data.sequence_start.implicit == 1: implicit = True @@ -826,7 +821,7 @@ cdef class CParser: index = index+1 self._parse_next_event() - end_mark._fast_init(self.stream_name, + end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, @@ -836,9 +831,7 @@ cdef class CParser: return node cdef _compose_mapping_node(self, object anchor): - cdef Mark start_mark = Mark.__new__(Mark) - cdef Mark end_mark = Mark.__new__(Mark) - start_mark._fast_init(self.stream_name, + cdef Mark start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -868,7 +861,7 @@ cdef class CParser: item_value = self._compose_node(node, item_key) value.append((item_key, item_value)) self._parse_next_event() - end_mark._fast_init(self.stream_name, + cdef Mark end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, From 688ebe766c4c9e4cb4e3d79aa9cfc5fbb9c0f097 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 16 May 2025 14:50:57 -0400 Subject: [PATCH 3/6] avoid all the ref counting --- yaml/_yaml.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index 86f138118..43e9b5c3a 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -68,10 +68,6 @@ cdef class Mark: def __init__(self, object name, size_t index, size_t line, size_t column, object buffer, object pointer): - self._fast_init(name, index, line, column, buffer, pointer) - - cdef _fast_init(self, object name, size_t index, size_t line, size_t column, - object buffer, object pointer): self.name = name self.index = index self.line = line @@ -91,7 +87,12 @@ cdef class Mark: cdef Mark _create_mark(object name, size_t index, size_t line, size_t column, object buffer, object pointer): cdef Mark mark = Mark.__new__(Mark) - mark._fast_init(name, index, line, column, buffer, pointer) + mark.name = name + mark.index = index + mark.line = line + mark.column = column + mark.buffer = buffer + mark.pointer = pointer return mark #class YAMLError(Exception): From 084815363106e5c48933ff78ddb5baea6f91f150 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 16 May 2025 14:58:38 -0400 Subject: [PATCH 4/6] Reduce diff size --- yaml/_yaml.pyx | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index 43e9b5c3a..cdad43b8d 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -312,8 +312,6 @@ cdef class CParser: pass cdef object _parser_error(self): - cdef Mark context_mark - cdef Mark problem_mark if self.parser.error == YAML_MEMORY_ERROR: return MemoryError elif self.parser.error == YAML_READER_ERROR: @@ -321,6 +319,8 @@ cdef class CParser: self.parser.problem_value, u'?', PyUnicode_FromString(self.parser.problem)) elif self.parser.error == YAML_SCANNER_ERROR \ or self.parser.error == YAML_PARSER_ERROR: + context_mark = None + problem_mark = None if self.parser.context != NULL: context_mark = _create_mark(self.stream_name, self.parser.context_mark.index, @@ -368,12 +368,12 @@ cdef class CParser: return token_object cdef object _token_to_object(self, yaml_token_t *token): - cdef Mark start_mark = _create_mark(self.stream_name, + start_mark = _create_mark(self.stream_name, token.start_mark.index, token.start_mark.line, token.start_mark.column, None, None) - cdef Mark end_mark = _create_mark(self.stream_name, + end_mark = _create_mark(self.stream_name, token.end_mark.index, token.end_mark.line, token.end_mark.column, @@ -515,12 +515,12 @@ cdef class CParser: cdef object _event_to_object(self, yaml_event_t *event): cdef yaml_tag_directive_t *tag_directive - cdef Mark start_mark = _create_mark(self.stream_name, + start_mark = _create_mark(self.stream_name, event.start_mark.index, event.start_mark.line, event.start_mark.column, None, None) - cdef Mark end_mark = _create_mark(self.stream_name, + end_mark = _create_mark(self.stream_name, event.end_mark.index, event.end_mark.line, event.end_mark.column, @@ -677,7 +677,6 @@ cdef class CParser: return self._compose_document() def get_single_node(self): - cdef Mark mark self._parse_next_event() yaml_event_delete(&self.parsed_event) self._parse_next_event() @@ -704,7 +703,6 @@ cdef class CParser: return node cdef object _compose_node(self, object parent, object index): - cdef Mark mark self._parse_next_event() if self.parsed_event.type == YAML_ALIAS_EVENT: anchor = PyUnicode_FromYamlString(self.parsed_event.data.alias.anchor) @@ -747,12 +745,12 @@ cdef class CParser: return node cdef _compose_scalar_node(self, object anchor): - cdef Mark start_mark = _create_mark(self.stream_name, + start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, None, None) - cdef Mark end_mark = _create_mark(self.stream_name, + end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, @@ -790,12 +788,11 @@ cdef class CParser: cdef _compose_sequence_node(self, object anchor): cdef int index - cdef Mark start_mark = _create_mark(self.stream_name, + start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, None, None) - cdef Mark end_mark implicit = False if self.parsed_event.data.sequence_start.implicit == 1: implicit = True @@ -821,18 +818,16 @@ cdef class CParser: value.append(self._compose_node(node, index)) index = index+1 self._parse_next_event() - - end_mark = _create_mark(self.stream_name, + node.end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, None, None) - node.end_mark = end_mark yaml_event_delete(&self.parsed_event) return node cdef _compose_mapping_node(self, object anchor): - cdef Mark start_mark = _create_mark(self.stream_name, + start_mark = _create_mark(self.stream_name, self.parsed_event.start_mark.index, self.parsed_event.start_mark.line, self.parsed_event.start_mark.column, @@ -862,12 +857,11 @@ cdef class CParser: item_value = self._compose_node(node, item_key) value.append((item_key, item_value)) self._parse_next_event() - cdef Mark end_mark = _create_mark(self.stream_name, + node.end_mark = _create_mark(self.stream_name, self.parsed_event.end_mark.index, self.parsed_event.end_mark.line, self.parsed_event.end_mark.column, None, None) - node.end_mark = end_mark yaml_event_delete(&self.parsed_event) return node From 4192cdab8cc2b523f1ffde976186167b4d2a64e6 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 16 May 2025 15:01:21 -0400 Subject: [PATCH 5/6] remove whitespace --- yaml/_yaml.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index cdad43b8d..dc87d7795 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -320,7 +320,7 @@ cdef class CParser: elif self.parser.error == YAML_SCANNER_ERROR \ or self.parser.error == YAML_PARSER_ERROR: context_mark = None - problem_mark = None + problem_mark = None if self.parser.context != NULL: context_mark = _create_mark(self.stream_name, self.parser.context_mark.index, From aa051e8e1e02588a08fe4bd93fdcaa90ba954eeb Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Fri, 16 May 2025 15:02:52 -0400 Subject: [PATCH 6/6] document --- yaml/_yaml.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/yaml/_yaml.pyx b/yaml/_yaml.pyx index dc87d7795..88fd5ea79 100644 --- a/yaml/_yaml.pyx +++ b/yaml/_yaml.pyx @@ -84,6 +84,8 @@ cdef class Mark: return where # Helper function to create a Mark object with a single call +# This version does not have to convert size_t back to Python +# which makes it quite a bit faster when called from Cython cdef Mark _create_mark(object name, size_t index, size_t line, size_t column, object buffer, object pointer): cdef Mark mark = Mark.__new__(Mark)