3 # The software in this package is distributed under the GNU General
4 # Public License version 2 (with a special exception described below).
6 # A copy of GNU General Public License (GPL) is included in this distribution,
7 # in the file COPYING.GPL.
9 # As a special exception, if other files instantiate templates or use macros
10 # or inline functions from this file, or you compile this file and link it
11 # with other works to produce a work based on this file, this file
12 # does not by itself cause the resulting work to be covered
13 # by the GNU General Public License.
15 # However the source code for this file must still be made available
16 # in accordance with section (3) of the GNU General Public License.
18 # This exception does not invalidate any other reasons why a work based
19 # on this file might be covered by the GNU General Public License.
21 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
24 string: functionality for parsing cnfvars from strings
26 .. codeauthor:: Intra2net
30 -------------------------------------------------------------------------------
32 This module provides read and parse functionality for the Intra2net *CNF*
33 format from strings and by extension cnf files.
35 The input string takes one round-trip through the parsers and will error out on
36 problematic lines. Thus, this module can also be used to syntax-check CNF data.
38 Note that line numbers may be arbitrarily reassigned in the process. Of course,
39 parent references and the relative ordering of lines will be preserved in this
43 Decide on some facility for automatic fixup of line number values. The
44 internal representation is recursive so line numbers are not needed to
45 establish a variable hierarchy. They might as well be omitted from the json
46 input and only added when writing cnf. For the time being a lack of the
47 "number" field is interpreted as an error. Though it should at least
48 optionally be possible to omit the numbers entirely and have a function add
49 them after the fact. This might as well be added to :py:func:`is_cnf`
50 though it would be counterintuitive to have a predicate mutate its
51 argument. So maybe it could return a second argument to indicate a valid
52 structure that needs fixup or something like that.
55 The variable values of get_cnf seem to be encoded in latin1, and set_cnf
56 seems to assume latin1-encoded values (not var names). Function
57 :py:func:`read_cnf` converts this to unicode and other functions convert
58 unicode back to latin1.
61 notes on Python 3 conversion
62 -------------------------------------------------------------------------------
64 Since the original *CNF* format assumes latin-1 encoded data pretty much
65 exclusively, we preserve the original encoding while parsing the file.
66 When assembling the data structures returned to the user, values are then
67 converted to strings so they can be used naturally at the Python end.
70 -------------------------------------------------------------------------------
80 ###############################################################################
82 ###############################################################################
# Keys that every CNF_VAR dictionary has to provide.
CNF_FIELD_MANDATORY = {"varname", "data", "instance"}
# Keys that a CNF_VAR dictionary may provide in addition.
CNF_FIELD_OPTIONAL = {"parent", "children", "comment", "number"}
# Every key recognized in a CNF_VAR dictionary; anything else is unknown.
CNF_FIELD_KNOWN = CNF_FIELD_MANDATORY.union(CNF_FIELD_OPTIONAL)
89 grab_parent_pattern = re.compile(b"""
98 base_line_pattern = re.compile(b"""
100 \s* # optional spaces
103 ([A-Z][A-Z0-9_]*) # varname
104 \s* # optional spaces
106 \s* # optional spaces
108 \s* # optional spaces
110 \s* # optional spaces
111 \"( # quoted string (data)
112 (?: \\\" # (of escaped dquote
113 |[^\"])* # or anything not a
115 \s* # optional spaces
118 \s* # optional spaces
119 .* # string (comment)
120 )? # egroup, optional
125 child_line_pattern = re.compile(b"""
127 \s* # optional spaces
132 ([A-Z][A-Z0-9_]*) # varname
133 \s* # optional spaces
135 \s* # optional spaces
137 \s* # optional spaces
139 \s* # optional spaces
140 \"([^\"]*)\" # quoted string (data)
141 \s* # optional spaces
144 \s* # optional spaces
145 .* # string (comment)
146 )? # egroup, optional
152 ###############################################################################
154 ###############################################################################
158 # Sadly, the Intranator is still stuck with one leg in the 90s.
161 """Take given unicode str and convert it to a latin1-encoded `bytes`."""
162 return s.encode("latin-1")
166 """Take given latin1-encoded `bytes` value and convert it to `str`."""
167 return s.decode("latin-1")
171 # Conversion functions
174 def marshal_in_number(number):
178 def marshal_in_parent(parent):
182 def marshal_in_instance(instance):
def marshal_in_varname(varname):
    """Decode a raw variable name via ``from_latin1`` and lower-case it."""
    decoded = from_latin1(varname)
    return decoded.lower()
def marshal_in_data(data):
    """Decode the raw data field; a missing (``None``) field becomes ``""``."""
    if data is None:
        return ""
    return from_latin1(data)
def marshal_in_comment(comment):
    """
    Convert the raw comment field of a parsed line.

    :param comment: raw comment as captured by the line pattern, or a falsy
                    value (``None`` / empty) if the line had no comment
    :returns: the decoded comment text without its first character and
              without surrounding whitespace, or ``None`` if there is no
              comment or the remaining text is empty
    """
    # Explicit guards replace the fragile ``x and y or None`` chain; the
    # results are the same for every input.
    if not comment:
        return None
    # Drop the first character, then trim whitespace before decoding.
    text = from_latin1(comment[1:].strip())
    # An empty comment body is reported as "no comment".
    return text or None
203 return isinstance(s, str)
206 ###############################################################################
208 ###############################################################################
211 class InvalidCNF(Exception):
213 def __init__(self, msg):
217 return "Malformed CNF_VAR: \"%s\"" % self.msg
220 class MalformedCNF(Exception):
222 def __init__(self, msg):
226 return "Malformed CNF file: \"%s\"" % self.msg
229 ###############################################################################
231 ###############################################################################
243 raise InvalidCNF("CNF_VAR lacks a name.")
244 elif not is_string(varname):
245 raise InvalidCNF("Varname field of CNF_VAR \"%s\" is not a string."
248 raise InvalidCNF("Varname field of CNF_VAR is the empty string.")
250 if comment is not None:
251 if not is_string(comment):
252 raise InvalidCNF("Comment field of CNF_VAR \"%s\" is not a string."
256 raise InvalidCNF("Data field of CNF_VAR \"%s\" is empty."
258 elif not is_string(data):
259 raise InvalidCNF("Data field of CNF_VAR \"%s\" is not a string."
263 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is empty."
265 elif not isinstance(instance, int):
266 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is not an integer."
270 raise InvalidCNF("Number field of CNF_VAR \"%s\" is empty."
272 elif not isinstance(number, int):
273 raise InvalidCNF("Number field of CNF_VAR \"%s\" is not an integer."
276 raise InvalidCNF("Number field of CNF_VAR \"%s\" must be positive, not %d."
279 other = acc.get(number, None)
280 if other is not None: # already in use
281 raise InvalidCNF("Number field of CNF_VAR \"%s\" already used by variable %s."
283 acc[number] = varname
287 raise InvalidCNF("Parent field of nested CNF_VAR \"%s\" is empty."
289 elif not isinstance(parent, int):
290 raise InvalidCNF("Parent field of CNF_VAR \"%s\" is not an integer."
293 if parent is not None:
294 raise InvalidCNF("Flat CNF_VAR \"%s\" has nonsensical parent field \"%s\"."
301 is_cnf -- Predicate testing "CNF_VAR-ness". Folds the :py:func:`is_valid`
302 predicate over the argument which can be either a well-formed CNF
303 dictionary or a list of CNF_VARs.
305 :type root: cnfvar or cnf list
308 Note that if it returns at all, ``is_cnf()`` returns ``True``. Any non
309 well-formed member of the argument will cause the predicate to bail out
310 with an exception during traversal.
314 raise InvalidCNF(root)
315 return walk_cnf(cnf, False, is_valid, {}) is not None
320 Check whether a dictionary is a valid CNF.
322 :param dict obj: dictionary to check
323 :returns: True if the dictionary has all the mandatory fields and no
324 unknown fields, False otherwise
327 assert isinstance (obj, dict)
329 for f in CNF_FIELD_MANDATORY:
330 if obj.get(f, None) is None:
334 if f not in CNF_FIELD_KNOWN:
340 ###############################################################################
342 ###############################################################################
346 # Parsing usually starts from `read_cnf`, which accepts a string containing
347 # the variables to parse in the same structure as returned by `get_cnf`.
349 # In the `prepare` function the string is split into lines, and a 3-element
350 # tuple is built. The first (named `current`) and second (named `next`)
351 # elements of this tuple are respectively the first and second non-empty lines
352 # of the input, while the third is a list of the remaining lines. This tuple is
353 # named `state` in the implementation below, and it is passed around during
354 # parsing. The `get` and `peek` functions are used to easily retrieve the
355 # `current` and `next` items from the "state".
357 # When we "advance" the state, we actually drop the "current" element,
358 # replacing it with the "next", while a new "next" is popped from the list of
359 # remaining lines. Parsing is done this way because we need to look ahead at
360 # the next line -- if it is a child it needs to be appended to the `children`
361 # property of the current line.
363 # Regular expressions are used to extract important information from the CNF
364 # lines. Finally, once parsing is completed, a dictionary is returned. The dict
365 # has the same structure as the serialized JSON output returned by
372 Read cnf data from data bytes.
374 :param data: raw data
375 :type data: str or bytes
376 :return: the parsed cnf data
377 :rtype: {str, {str, str or int}}
379 if isinstance(data, str):
380 data = to_latin1(data)
381 state = prepare(data)
383 raise InvalidCNF("Empty input string.")
385 cnf = parse_cnf_root(state)
386 if is_cnf(cnf) is False:
387 raise TypeError("Invalid CNF_VAR.")
393 Build 3-element iterable from a CNF string dump.
395 :param raw: string content as returned by `get_cnf`
397 :returns: 3-element tuple, where the first two elements are the first two
398 lines of the output and the third is a list containing the rest
399 of the lines in reverse.
400 :rtype: (str * str option * str list) option
402 lines = raw.splitlines()
414 first = first.strip()
416 return advance((first, second, lines))
418 return (first, second, lines)
423 Pop the next line from the stream, advancing the tuple.
425 :param cns: a 3-element tuple containing two CNF lines and a list of the
427 :type cns: (str, str, [str])
428 :returns: a new tuple with a new item popped from the list of lines
429 :rtype: (str, str, [str])
431 current, next, stream = cns
432 if next is None: # reached end of stream
442 if current == "": # skip blank lines
443 return advance((current, next, stream))
444 return (current, next, stream)
449 Get the current line from the state without advancing it.
451 :param cns: a 3-element tuple containing two CNF lines and a list of the
453 :type cns: (str, str, [str])
454 :returns: the CNF line stored as `current`
461 def parse_cnf_root(state):
463 Iterate over and parse a list of CNF lines.
465 :param state: a 3-element tuple containing two lines and a list of the
467 :type state: (str, str, [str])
468 :returns: a list of parsed CNF variables
471 The function will parse the first element from the `state` tuple, then read
472 the next line to see if it is a child variable. If it is, it will be
473 appended to the last parsed CNF, otherwise top-level parsing is done
479 cnf_line = read_base_line(current)
480 if cnf_line is not None:
481 lines.append(cnf_line)
482 state = advance(state)
483 if state is None: # -> nothing left to do
486 parent = get_parent(current) # peek at next line
487 if parent is not None: # -> recurse into children
488 (state, children, _parent) = parse_cnf_children(state, parent)
489 cnf_line["children"] = children
494 state = advance(state)
501 def parse_cnf_children(state, parent):
503 Read and parse child CNFs of a given parent until there is none left.
505 :param state: a 3-element tuple containing two lines and a list of the
507 :type state: (str, str, [str])
508 :param parent: id of the parent whose children we are looking for
510 :returns: a 3-element tuple with the current state, a list of children of
511 the given parent and the parent ID
512 :rtype: (tuple, [str], int)
514 The function will recursively parse child lines from the `state` tuple
515 until one of these conditions is satisfied:
517 1. the input is exhausted
519 2.1. is a toplevel line
520 2.2. is a child line whose parent has a lower parent number
522 Conceptually, 2.1 is very similar to 2.2 but due to the special status of
523 toplevel lines in CNF we need to handle them separately.
525 Note that since nesting of CNF vars is achieved via parent line numbers,
526 lines with different parents could appear out of order. libcnffile will
527 happily parse those and still assign children to the specified parent:
531 1 USER,1337: "l33t_h4x0r"
532 2 (1) USER_GROUP_MEMBER_REF,0: "2"
533 4 USER,1701: "picard"
534 5 (4) USER_GROUP_MEMBER_REF,0: "2"
535 6 (4) USER_PASSWORD,0: "engage"
536 3 (1) USER_PASSWORD,0: "hacktheplanet"
539 1 USER,1337: "l33t_h4x0r"
540 2 (1) USER_GROUP_MEMBER_REF,0: "2"
541 3 (1) USER_PASSWORD,0: "hacktheplanet"
543 1 USER,1701: "picard"
544 2 (1) USER_GROUP_MEMBER_REF,0: "2"
545 3 (1) USER_PASSWORD,0: "engage"
547 It is a limitation of ``cnfvar.py`` that it cannot parse CNF data
548 structured like the above example: child lists are only populated from
549 subsequent CNF vars using the parent number solely to track nesting levels.
550 The parser does not keep track of line numbers while traversing the input
551 so it doesn’t support retroactively assigning a child to anything else but
552 the immediate parent.
557 cnf_line = read_child_line(current)
558 if cnf_line is not None:
559 lines.append(cnf_line)
560 state = advance(state)
564 new_parent = get_parent(current)
565 if new_parent is None:
567 return (state, lines, None)
568 if new_parent > parent:
569 # parent is further down in hierarchy -> new level
570 (state, children, new_parent) = \
571 parse_cnf_children (state, new_parent)
574 cnf_line["children"] = children
576 new_parent = get_parent(current)
577 if new_parent is None:
579 return (state, lines, None)
580 if new_parent < parent:
581 # parent is further up in hierarchy -> pop level
582 return (state, lines, new_parent)
583 # new_parent == parent -> continue parsing on same level
584 return (state, lines, parent)
587 def get_parent(line):
589 Extract the ID of the parent for a given CNF line.
591 :param str line: CNF line
592 :returns: parent ID or None if no parent is found
595 match = re.match(grab_parent_pattern, line)
596 if match is None: # -> no parent
598 return int(match.groups()[0])
601 def read_base_line(line):
603 Turn one top-level CNF line into a dictionary.
605 :param str line: CNF line
608 This performs the necessary decoding on values to obtain proper Python
609 strings from 8-bit encoded CNF data.
611 The function only operates on individual lines. Argument strings that
612 contain data for multiple lines – this includes child lines of the current
613 CNF var! – will trigger a parsing exception.
615 if len(line.strip()) == 0:
616 return None # ignore empty lines
618 return None # ignore comments
620 match = re.match(base_line_pattern, line)
622 raise MalformedCNF("Syntax error in line \"\"\"%s\"\"\"" % line)
623 number, varname, instance, data, comment = match.groups()
625 "number" : marshal_in_number (number),
626 "varname" : marshal_in_varname (varname),
627 "instance" : marshal_in_instance (instance),
628 "data" : marshal_in_data (data),
629 "comment" : marshal_in_comment (comment),
633 def read_child_line(line):
635 Turn one child CNF line into a dictionary.
637 :param str line: CNF line
640 This function only operates on individual lines. If the argument string is
641 syntactically valid but contains input representing multiple CNF vars, a
642 parse error will be thrown.
644 if len(line.strip()) == 0:
645 return None # ignore empty lines
647 return None # ignore comments
649 match = re.match(child_line_pattern, line)
651 raise MalformedCNF("Syntax error in child line \"\"\"%s\"\"\""
652 % from_latin1 (line))
653 number, parent, varname, instance, data, comment = match.groups()
655 "number" : marshal_in_number (number),
656 "parent" : marshal_in_parent (parent),
657 "varname" : marshal_in_varname (varname),
658 "instance" : marshal_in_instance (instance),
659 "data" : marshal_in_data (data),
660 "comment" : marshal_in_comment (comment),
664 ###############################################################################
666 ###############################################################################
671 Extract a list of CNFs from a given structure.
673 :param root: list of CNFs or a CNF dictionary
674 :type root: [dict] or dict
675 :raises: :py:class:`TypeError` if no CNFs can be extracted
676 :returns: list with one or more CNF objects
679 Output varies depending on a few conditions:
680 - If `root` is a list, return it right away
681 - If `root` is a dict corresponding to a valid CNF value, return it wrapped
683 - If `root` is a dict with a `cnf` key containing a list (as the JSON
684 returned by `get_cnf -j`), return the value
685 - Otherwise, raise an error
687 if isinstance(root, list):
689 if not isinstance(root, dict):
691 "Expected dictionary of CNF_VARs, got %s." % type(root))
694 cnf = root.get("cnf", None)
695 if not isinstance(cnf, list):
696 raise TypeError("Expected list of CNF_VARs, got %s." % type(cnf))
700 ###############################################################################
702 ###############################################################################
705 def walk_cnf(cnf, nested, fun, acc):
707 Depth-first traversal of a CNF tree.
711 :type fun: 'a -> bool -> (cnf stuff) -> 'a
715 Executes ``fun`` recursively for each node in the tree. The function
716 receives the accumulator ``acc`` which can be of an arbitrary type as first
717 argument. The second argument is a flag indicating whether the current
718 CNF var is a child (if ``True``) or a parent var. CNF member fields are
719 passed via named optional arguments.
724 comment=var.get("comment", None),
725 data=var.get("data", None),
726 instance=var.get("instance", None),
727 number=var.get("number", None),
728 parent=var.get("parent", None),
729 varname=var.get("varname", None))
730 children = var.get("children", None)
731 if children is not None:
732 acc = walk_cnf(children, True, fun, acc)