@@ -128,104 +128,92 @@ def inpxml_todict(parent: etree._Element,
128
128
129
129
:return: a python dictionary
130
130
"""
131
+ #These keys must never appear as an attribute/tag name
132
+ #The underscores should guarantee that
133
+ _TEXT_PLACEHOLDER = '__text__'
134
+ _OMIT_PLACEHOLDER = '__omit__'
131
135
132
136
#Check if this is the first call to this routine
133
137
if base_xpath is None :
134
138
base_xpath = f'/{ parent .tag } '
135
139
136
- return_dict : dict [str , Any ] = {}
137
- if list (parent .items ()):
138
- return_dict = {str (key ): val for key , val in parent .items ()}
139
- # Now we have to convert lazy fortran style into pretty things for the Database
140
- for key in return_dict :
141
- if key in schema_dict ['attrib_types' ]:
142
- return_dict [key ], suc = convert_from_xml (return_dict [key ],
140
+ content : dict [str , Any ] = {}
141
+ # Now we have to convert lazy fortran style into pretty things for the Database
142
+ for key , value in parent .items ():
143
+ attrib_name , value = str (key ), str (value )
144
+ if attrib_name in schema_dict ['attrib_types' ]:
145
+ content [attrib_name ], suc = convert_from_xml (value ,
143
146
schema_dict ,
144
- key ,
147
+ attrib_name ,
145
148
text = False ,
146
149
constants = constants ,
147
150
logger = logger )
148
- if not suc and logger is not None :
149
- logger .warning ("Failed to convert attribute '%s' Got: '%s'" , key , return_dict [key ])
150
-
151
- if parent .text :
152
- # has text, but we don't want all the '\n' s and empty strings in the database
153
- if parent .text .strip () != '' : # might not be the best solutions
154
- if parent .tag not in schema_dict ['text_tags' ]:
155
- if logger is not None :
156
- logger .error ('Something is wrong in the schema_dict: %s is not in text_tags, but it has text' ,
157
- parent .tag )
158
- raise ValueError (
159
- f'Something is wrong in the schema_dict: { parent .tag } is not in text_tags, but it has text' )
160
-
161
- converted_text , suc = convert_from_xml (str (parent .text ),
162
- schema_dict ,
163
- parent .tag ,
164
- text = True ,
165
- constants = constants ,
166
- logger = logger )
167
-
168
151
if not suc and logger is not None :
169
- logger .warning ("Failed to text of '%s' Got: '%s'" , parent .tag , parent .text )
152
+ logger .warning ("Failed to convert attribute '%s' Got: '%s'" , attrib_name , value )
153
+
154
+ # has text, but we don't want all the '\n' s and empty strings in the database
155
+ if parent .text and parent .text .strip () != '' :
156
+
157
+ if parent .tag not in schema_dict ['text_tags' ]:
158
+ if logger is not None :
159
+ logger .error ('Something is wrong in the schema_dict: %s is not in text_tags, but it has text' ,
160
+ parent .tag )
161
+ raise ValueError (
162
+ f'Something is wrong in the schema_dict: { parent .tag } is not in text_tags, but it has text' )
170
163
171
- if not return_dict :
172
- return_dict = converted_text #type:ignore
173
- else :
174
- return_dict ['text_value' ] = converted_text
175
- if 'label' in return_dict :
176
- return_dict ['text_label' ] = return_dict ['label' ]
177
- return_dict .pop ('label' )
164
+ converted_text , suc = convert_from_xml (str (parent .text ),
165
+ schema_dict ,
166
+ parent .tag ,
167
+ text = True ,
168
+ constants = constants ,
169
+ logger = logger )
170
+
171
+ if not suc and logger is not None :
172
+ logger .warning ("Failed to text of '%s' Got: '%s'" , parent .tag , parent .text )
173
+
174
+ content [_TEXT_PLACEHOLDER ] = converted_text
178
175
179
176
tag_info = schema_dict ['tag_info' ].get (base_xpath , EMPTY_TAG_INFO )
180
177
for element in parent :
181
178
182
- new_base_xpath = f'{ base_xpath } /{ element .tag } '
183
- omitt_contained_tags = element .tag in schema_dict ['omitt_contained_tags' ]
184
- new_return_dict = inpxml_todict (element ,
185
- schema_dict ,
186
- constants ,
187
- base_xpath = new_base_xpath ,
188
- omitted_tags = omitt_contained_tags ,
189
- logger = logger )
190
-
191
- if element .tag in tag_info ['several' ]:
192
- # make a list, otherwise the tag will be overwritten in the dict
193
- if element .tag not in return_dict : # is this the first occurrence?
194
- if omitted_tags :
195
- if len (return_dict ) == 0 :
196
- return_dict = [] #type:ignore
197
- else :
198
- return_dict [element .tag ] = []
199
- if omitted_tags :
200
- return_dict .append (new_return_dict ) #type:ignore
201
- elif 'text_value' in new_return_dict :
202
- for key , value in new_return_dict .items ():
203
- if key == 'text_value' :
204
- return_dict [element .tag ].append (value )
205
- elif key == 'text_label' :
206
- if 'labels' not in return_dict :
207
- return_dict ['labels' ] = {}
208
- return_dict ['labels' ][value ] = new_return_dict ['text_value' ]
209
- else :
210
- if key not in return_dict :
211
- return_dict [key ] = []
212
- elif not isinstance (return_dict [key ], list ): #Key seems to be defined already
213
- if logger is not None :
214
- logger .error ('%s cannot be extracted to the next level' , key )
215
- raise ValueError (f'{ key } cannot be extracted to the next level' )
216
- return_dict [key ].append (value )
217
- for key in new_return_dict .keys ():
218
- if key in ['text_value' , 'text_label' ]:
219
- continue
220
- if len (return_dict [key ]) != len (return_dict [element .tag ]):
179
+ child_content = inpxml_todict (element ,
180
+ schema_dict ,
181
+ constants ,
182
+ base_xpath = f'{ base_xpath } /{ element .tag } ' ,
183
+ omitted_tags = element .tag in schema_dict ['omitt_contained_tags' ],
184
+ logger = logger )
185
+
186
+ if _OMIT_PLACEHOLDER in child_content :
187
+ #We know that there is only one key here
188
+ child_content = child_content .pop (_OMIT_PLACEHOLDER )
189
+
190
+ tag_name = element .tag
191
+ if omitted_tags :
192
+ tag_name = _OMIT_PLACEHOLDER
193
+
194
+ if element .tag in tag_info ['several' ]\
195
+ and _TEXT_PLACEHOLDER in child_content :
196
+ #The text is stored under the name of the tag
197
+ text_value = child_content .pop (_TEXT_PLACEHOLDER )
198
+ content .setdefault (tag_name , []).append (text_value )
199
+ child_tag_info = schema_dict ['tag_info' ].get (f'{ base_xpath } /{ element .tag } ' , EMPTY_TAG_INFO )
200
+ for key , value in child_content .items ():
201
+ if key not in child_tag_info ['optional_attribs' ]:
202
+ #All required attributes are stored as lists
203
+ if key in content and \
204
+ not isinstance (content [key ], list ): #Key seems to be defined already
221
205
if logger is not None :
222
- logger .error (
223
- 'Extracted optional argument %s at the moment only label is supported correctly' , key )
224
- raise ValueError (
225
- f'Extracted optional argument { key } at the moment only label is supported correctly' )
226
- else :
227
- return_dict [element .tag ].append (new_return_dict )
206
+ logger .error ('%s cannot be extracted to the next level' , key )
207
+ raise ValueError (f'{ key } cannot be extracted to the next level' )
208
+ content .setdefault (key , []).append (value )
209
+ else :
210
+ #All optional attributes are stored as dicts pointing to the text
211
+ content .setdefault (key , {})[value ] = text_value
212
+ elif element .tag in tag_info ['several' ]:
213
+ content .setdefault (tag_name , []).append (child_content )
214
+ elif _TEXT_PLACEHOLDER in child_content :
215
+ content [tag_name ] = child_content .pop (_TEXT_PLACEHOLDER )
228
216
else :
229
- return_dict [ element . tag ] = new_return_dict
217
+ content [ tag_name ] = child_content
230
218
231
- return return_dict
219
+ return content
0 commit comments