3 # The software in this package is distributed under the GNU General
4 # Public License version 2 (with a special exception described below).
6 # A copy of GNU General Public License (GPL) is included in this distribution,
7 # in the file COPYING.GPL.
9 # As a special exception, if other files instantiate templates or use macros
10 # or inline functions from this file, or you compile this file and link it
11 # with other works to produce a work based on this file, this file
12 # does not by itself cause the resulting work to be covered
13 # by the GNU General Public License.
15 # However the source code for this file must still be made available
16 # in accordance with section (3) of the GNU General Public License.
18 # This exception does not invalidate any other reasons why a work based
19 # on this file might be covered by the GNU General Public License.
21 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
24 Functionality for parsing cnfvars from strings.
26 .. codeauthor:: Intra2net
28 This module provides read and parse functionality for the Intra2net *CNF*
29 format from strings and by extension cnf files.
31 The input string takes one round-trip through the parsers and will error out on
32 problematic lines. Thus, this module can also be used to syntax-check CNF data.
34 Note that line numbers may be arbitrarily reassigned in the process. Of course,
35 parent references and the relative ordering of lines will be preserved in this
39 Decide on some facility for automatic fixup of line number values. The
40 internal representation is recursive so line numbers are not needed to
41 establish a variable hierarchy. They might as well be omitted from the json
42 input and only added when writing cnf. For the time being a lack of the
43 "number" field is interpreted as an error. Though it should at least
44 optionally be possible to omit the numbers entirely and have a function add
45 them after the fact. This might as well be added to :py:func:`is_cnf`
46 though it would be counterintuitive to have a predicate mutate its
47 argument. So maybe it could return a second argument to indicate a valid
48 structure that needs fixup or something like that.
51 Since the original *CNF* format assumes latin-1 encoded data pretty much
52 exclusively, we preserve the original encoding while parsing the file.
53 When assembling the data structures returned to the user, values are then
54 converted to strings so they can be used naturally at the Python end.
64 ###############################################################################
66 ###############################################################################
69 CNF_FIELD_MANDATORY = set ([ "varname", "data", "instance" ])
70 CNF_FIELD_OPTIONAL = set ([ "parent", "children", "comment", "number" ])
71 CNF_FIELD_KNOWN = CNF_FIELD_MANDATORY | CNF_FIELD_OPTIONAL
73 grab_parent_pattern = re.compile(rb"""
82 base_line_pattern = re.compile(rb"""
87 ([A-Z][A-Z0-9_]*) # varname
95 \"( # quoted string (data)
96 (?: \\\" # (of escaped dquote
97 |[^\"])* # or anything not a
102 \s* # optional spaces
103 .* # string (comment)
104 )? # egroup, optional
109 child_line_pattern = re.compile(rb"""
111 \s* # optional spaces
116 ([A-Z][A-Z0-9_]*) # varname
117 \s* # optional spaces
119 \s* # optional spaces
121 \s* # optional spaces
123 \s* # optional spaces
124 \"([^\"]*)\" # quoted string (data)
125 \s* # optional spaces
128 \s* # optional spaces
129 .* # string (comment)
130 )? # egroup, optional
136 ###############################################################################
138 ###############################################################################
142 # Sadly, the Intranator is still stuck with one leg in the 90s.
145 """Take given unicode str and convert it to a latin1-encoded `bytes`."""
146 return s.encode("latin-1")
150 """Take given latin1-encoded `bytes` value and convert it to `str`."""
151 return s.decode("latin-1")
155 # Conversion functions
158 def marshal_in_number(number):
162 def marshal_in_parent(parent):
166 def marshal_in_instance(instance):
170 def marshal_in_varname(varname):
171 return from_latin1(varname).lower()
174 def marshal_in_data(data):
175 return from_latin1(data).replace(r"\"", "\"") if data is not None else ""
178 def marshal_in_comment(comment):
179 return comment and from_latin1(comment[1:].strip()) or None
187 return isinstance(s, str)
190 ###############################################################################
192 ###############################################################################
195 class InvalidCNF(Exception):
197 def __init__(self, msg):
201 return "Malformed CNF_VAR: \"%s\"" % self.msg
204 class MalformedCNF(Exception):
206 def __init__(self, msg):
210 return "Malformed CNF file: \"%s\"" % self.msg
213 ###############################################################################
215 ###############################################################################
227 raise InvalidCNF("CNF_VAR lacks a name.")
228 elif not is_string(varname):
229 raise InvalidCNF("Varname field of CNF_VAR \"%s\" is not a string."
232 raise InvalidCNF("Varname field of CNF_VAR is the empty string.")
234 if comment is not None:
235 if not is_string(comment):
236 raise InvalidCNF("Comment field of CNF_VAR \"%s\" is not a string."
240 raise InvalidCNF("Data field of CNF_VAR \"%s\" is empty."
242 elif not is_string(data):
243 raise InvalidCNF("Data field of CNF_VAR \"%s\" is not a string."
247 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is empty."
249 elif not isinstance(instance, int):
250 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is not an integer."
254 raise InvalidCNF("Number field of CNF_VAR \"%s\" is empty."
256 elif not isinstance(number, int):
257 raise InvalidCNF("Number field of CNF_VAR \"%s\" is not an integer."
260 raise InvalidCNF("Number field of CNF_VAR \"%s\" must be positive, not %d."
263 other = acc.get(number, None)
264 if other is not None: # already in use
265 raise InvalidCNF("Number field of CNF_VAR \"%s\" already used by variable %s."
267 acc[number] = varname
271 raise InvalidCNF("Parent field of nested CNF_VAR \"%s\" is empty."
273 elif not isinstance(parent, int):
274 raise InvalidCNF("Parent field of CNF_VAR \"%s\" is not an integer."
277 if parent is not None:
278 raise InvalidCNF("Flat CNF_VAR \"%s\" has nonsensical parent field \"%s\"."
285 is_cnf -- Predicate testing "CNF_VAR-ness". Folds the :py:func:`is_valid`
286 predicate over the argument which can be either a well-formed CNF
287 dictionary or a list of CNF_VARs.
289 :type root: cnfvar or cnf list
292 Not that if it returns at all, ``is_cnf()`` returns ``True``. Any non
293 well-formed member of the argument will cause the predicate to bail out
294 with an exception during traversal.
298 raise InvalidCNF(root)
299 return walk_cnf(cnf, False, is_valid, {}) is not None
304 Check whether a dictionary is a valid CNF.
306 :param dict obj: dictionary to check
307 :returns: True if the dictionary has all the mandatory fields and no
308 unknown fields, False otherwise
311 assert isinstance (obj, dict)
313 for f in CNF_FIELD_MANDATORY:
314 if obj.get(f, None) is None:
318 if f not in CNF_FIELD_KNOWN:
324 ###############################################################################
326 ###############################################################################
330 # Parsing usually starts from the `read_cnf`, which accepts a string containing
331 # the variables to parse in the same structure as returned by `get_cnf`.
333 # In the `prepare` function the string is split into lines, and a 3-element
334 # tuple is built. The first (named `current`) and second (named `next`)
335 # elements of this tuple are respectively the first and second non-empty lines
336 # of the input, while the third is a list of the remaining lines. This tuple is
337 # named `state` in the implementation below, and it is passed around during
338 # parsing. The `get` and `peek` functions are used to easily retrieve the
339 # `current` and `next` items from the "state".
341 # When we "advance" the state, we actually drop the "current" element,
342 # replacing it with the "next", while a new "next" is popped from the list of
343 # remaining lines. Parsing is done this way because we need to look ahead at
344 # the next line -- if it is a child it needs to be appended to the `children`
345 # property of the current line.
347 # Regular expressions are used to extract important information from the CNF
348 # lines. Finally, once parsing is completed, a dictionary is returned. The dict
349 # has the same structure as the serialized JSON output returned by
356 Read cnf data from data bytes.
358 :param data: raw data
359 :type data: str or bytes
360 :return: the parsed cnf data
361 :rtype: {str, {str, str or int}}
363 if isinstance(data, str):
364 data = to_latin1(data)
365 state = prepare(data)
367 raise InvalidCNF("Empty input string.")
369 cnf = parse_cnf_root(state)
370 if is_cnf(cnf) is False:
371 raise TypeError("Invalid CNF_VAR.")
377 Build 3-element iterable from a CNF string dump.
379 :param raw: string content as returned by `get_cnf`
381 :returns: 3-element tuple, where the first two elements are the first two
382 lines of the output and the third is a list containing the rest
383 of the lines in reverse.
384 :rtype: (str * str option * str list) option
386 lines = raw.splitlines()
398 first = first.strip()
400 return advance((first, second, lines))
402 return (first, second, lines)
407 Pop the next line from the stream, advancing the tuple.
409 :param cns: a 3-element tuple containing two CNF lines and a list of the
411 :type cnd: (str, str, [str])
412 :returns: a new tuple with a new item popped from the list of lines
413 :rtype cnd: (str, str, [str])
415 current, next, stream = cns
416 if next is None: # reached end of stream
426 if current == "": # skip blank lines
427 return advance((current, next, stream))
428 return (current, next, stream)
433 Get the current line from the state without advancing it.
435 :param cns: a 3-element tuple containing two CNF lines and a list of the
437 :type cnd: (str, str, [str])
438 :returns: the CNF line stored as `current`
445 def parse_cnf_root(state):
447 Iterate over and parse a list of CNF lines.
449 :param state: a 3-element tuple containing two lines and a list of the
451 :type state: (str, str, [str])
452 :returns: a list of parsed CNF variables
455 The function will parse the first element from the `state` tuple, then read
456 the next line to see if it is a child variable. If it is, it will be
457 appended to the last parsed CNF, otherwise top-level parsing is done
463 cnf_line = read_base_line(current)
464 if cnf_line is not None:
465 lines.append(cnf_line)
466 state = advance(state)
467 if state is None: # -> nothing left to do
470 parent = get_parent(current) # peek at next line
471 if parent is not None: # -> recurse into children
472 (state, children, _parent) = parse_cnf_children(state, parent)
473 cnf_line["children"] = children
478 state = advance(state)
485 def parse_cnf_children(state, parent):
487 Read and parse child CNFs of a given parent until there is none left.
489 :param state: a 3-element tuple containing two lines and a list of the
491 :type state: (str, str, [str])
492 :param parent: id of the parent whose children we are looking for
494 :returns: a 3-element tuple with the current state, a list of children of
495 the given parent and the parent ID
496 :rtype: (tuple, [str], int)
498 The function will recursively parse child lines from the `state` tuple
499 until one of these conditions is satisfied:
501 1. the input is exhausted
503 2.1. is a toplevel line
504 2.2. is a child line whose parent has a lower parent number
506 Conceptually, 2.1 is a very similar to 2.2 but due to the special status of
507 toplevel lines in CNF we need to handle them separately.
509 Note that since nesting of CNF vars is achieved via parent line numbers,
510 lines with different parents could appear out of order. libcnffile will
511 happily parse those and still assign children to the specified parent:
515 1 USER,1337: "l33t_h4x0r"
516 2 (1) USER_GROUP_MEMBER_REF,0: "2"
517 4 USER,1701: "picard"
518 5 (4) USER_GROUP_MEMBER_REF,0: "2"
519 6 (4) USER_PASSWORD,0: "engage"
520 3 (1) USER_PASSWORD,0: "hacktheplanet"
523 1 USER,1337: "l33t_h4x0r"
524 2 (1) USER_GROUP_MEMBER_REF,0: "2"
525 3 (1) USER_PASSWORD,0: "hacktheplanet"
527 1 USER,1701: "picard"
528 2 (1) USER_GROUP_MEMBER_REF,0: "2"
529 3 (1) USER_PASSWORD,0: "engage"
531 It is a limitation of ``cnfvar.py`` that it cannot parse CNF data
532 structured like the above example: child lists are only populated from
533 subsequent CNF vars using the parent number solely to track nesting levels.
534 The parser does not keep track of line numbers while traversing the input
535 so it doesn’t support retroactively assigning a child to anything else but
536 the immediate parent.
541 cnf_line = read_child_line(current)
542 if cnf_line is not None:
543 lines.append(cnf_line)
544 state = advance(state)
548 new_parent = get_parent(current)
549 if new_parent is None:
551 return (state, lines, None)
552 if new_parent > parent:
553 # parent is further down in hierarchy -> new level
554 (state, children, new_parent) = \
555 parse_cnf_children (state, new_parent)
556 cnf_line["children"] = children
560 new_parent = get_parent(current)
561 if new_parent is None:
563 return (state, lines, None)
564 if new_parent < parent:
565 # parent is further up in hierarchy -> pop level
566 return (state, lines, new_parent)
567 # new_parent == parent -> continue parsing on same level
568 return (state, lines, parent)
571 def get_parent(line):
573 Extract the ID of the parent for a given CNF line.
575 :param str line: CNF line
576 :returns: parent ID or None if no parent is found
579 match = re.match(grab_parent_pattern, line)
580 if match is None: # -> no parent
582 return int(match.groups()[0])
585 def read_base_line(line):
587 Turn one top-level CNF line into a dictionary.
589 :param str line: CNF line
592 This performs the necessary decoding on values to obtain proper Python
593 strings from 8-bit encoded CNF data.
595 The function only operates on individual lines. Argument strings that
596 contain data for multiple lines – this includes child lines of the current
597 CNF var! – will trigger a parsing exception.
599 if len(line.strip()) == 0:
600 return None # ignore empty lines
602 return None # ignore comments
604 match = re.match(base_line_pattern, line)
606 raise MalformedCNF("Syntax error in line \"\"\"%s\"\"\"" % line)
607 number, varname, instance, data, comment = match.groups()
609 "number" : marshal_in_number (number),
610 "varname" : marshal_in_varname (varname),
611 "instance" : marshal_in_instance (instance),
612 "data" : marshal_in_data (data),
613 "comment" : marshal_in_comment (comment),
617 def read_child_line(line):
619 Turn one child CNF line into a dictionary.
621 :param str line: CNF line
624 This function only operates on individual lines. If the argument string is
625 syntactically valid but contains input representing multiple CNF vars, a
626 parse error will be thrown.
628 if len(line.strip()) == 0:
629 return None # ignore empty lines
631 return None # ignore comments
633 match = re.match(child_line_pattern, line)
635 raise MalformedCNF("Syntax error in child line \"\"\"%s\"\"\""
636 % from_latin1 (line))
637 number, parent, varname, instance, data, comment = match.groups()
639 "number" : marshal_in_number (number),
640 "parent" : marshal_in_parent (parent),
641 "varname" : marshal_in_varname (varname),
642 "instance" : marshal_in_instance (instance),
643 "data" : marshal_in_data (data),
644 "comment" : marshal_in_comment (comment),
648 ###############################################################################
650 ###############################################################################
655 Extract a list of CNFs from a given structure.
657 :param root: list of CNFs or a CNF dictionary
658 :type root: [dict] or dict
659 :raises: :py:class:`TypeError` if no CNFs can be extracted
660 :returns: list with one or more CNF objects
663 Output varies depending on a few conditions:
664 - If `root` is a list, return it right away
665 - If `root` is a dict corresponding to a valid CNF value, return it wrapped
667 - If `root` is a dict with a `cnf` key containg a list (as the JSON
668 returned by `get_cnf -j`), return the value
669 - Otherwise, raise an error
671 if isinstance(root, list):
673 if not isinstance(root, dict):
675 "Expected dictionary of CNF_VARs, got %s." % type(root))
678 cnf = root.get("cnf", None)
679 if not isinstance(cnf, list):
680 raise TypeError("Expected list of CNF_VARs, got %s." % type(cnf))
684 ###############################################################################
686 ###############################################################################
689 def walk_cnf(cnf, nested, fun, acc):
691 Depth-first traversal of a CNF tree.
695 :type fun: 'a -> bool -> (cnf stuff) -> 'a
699 Executes ``fun`` recursively for each node in the tree. The function
700 receives the accumulator ``acc`` which can be of an arbitrary type as first
701 argument. The second argument is a flag indicating whether the current
702 CNF var is a child (if ``True``) or a parent var. CNF member fields are
703 passed via named optional arguments.
708 comment=var.get("comment", None),
709 data=var.get("data", None),
710 instance=var.get("instance", None),
711 number=var.get("number", None),
712 parent=var.get("parent", None),
713 varname=var.get("varname", None))
714 children = var.get("children", None)
715 if children is not None:
716 acc = walk_cnf(children, True, fun, acc)