#!/usr/bin/env python # # The software in this package is distributed under the GNU General # Public License version 2 (with a special exception described below). # # A copy of GNU General Public License (GPL) is included in this distribution, # in the file COPYING.GPL. # # As a special exception, if other files instantiate templates or use macros # or inline functions from this file, or you compile this file and link it # with other works to produce a work based on this file, this file # does not by itself cause the resulting work to be covered # by the GNU General Public License. # # However the source code for this file must still be made available # in accordance with section (3) of the GNU General Public License. # # This exception does not invalidate any other reasons why a work based # on this file might be covered by the GNU General Public License. # # Copyright (c) 2016-2018 Intra2net AG """ Functionality for parsing cnfvars from strings. .. codeauthor:: Intra2net This module provides read and parse functionality for the Intra2net *CNF* format from strings and by extension cnf files. The input string takes one round-trip through the parsers and will error out on problematic lines. Thus, this module can also be used to syntax-check CNF data. Note that line numbers may be arbitrarily reassigned in the process. Of course, parent references and the relative ordering of lines will be preserved in this case. .. todo:: Decide on some facility for automatic fixup of line number values. The internal representation is recursive so line numbers are not needed to establish a variable hierarchy. They might as well be omitted from the json input and only added when writing cnf. For the time being a lack of the "number" field is interpreted as an error. Though it should at least optionally be possible to omit the numbers entirely and have a function add them after the fact. This might as well be added to :py:func:`is_cnf` though it would be counterintuitive to have a predicate mutate its argument. So maybe it could return a second argument to indicate a valid structure that needs fixup or something like that. .. note:: Since the original *CNF* format assumes latin-1 encoded data pretty much exclusively, we preserve the original encoding while parsing the file. When assembling the data structures returned to the user, values are then converted to strings so they can be used naturally at the Python end. """ import functools import sys import json import re import io ############################################################################### # CONSTANTS ############################################################################### CNF_FIELD_MANDATORY = set ([ "varname", "data", "instance" ]) CNF_FIELD_OPTIONAL = set ([ "parent", "children", "comment", "number" ]) CNF_FIELD_KNOWN = CNF_FIELD_MANDATORY | CNF_FIELD_OPTIONAL grab_parent_pattern = re.compile(rb""" ^ # match from start \s* # optional spaces \d+ # line number \s+ # spaces \((\d+)\) # parent """, re.VERBOSE) base_line_pattern = re.compile(rb""" ^ # match from start \s* # optional spaces (\d+) # line number \s+ # spaces ([A-Z][A-Z0-9_]*) # varname \s* # optional spaces , # delimiter \s* # optional spaces (-1|\d+) # instance \s* # optional spaces : # delimiter \s* # optional spaces \"( # quoted string (data) (?: \\\" # (of escaped dquote |[^\"])* # or anything not a )\" # literal quote) \s* # optional spaces ( # bgroup \# # comment leader \s* # optional spaces .* # string (comment) )? # egroup, optional $ # eol """, re.VERBOSE) child_line_pattern = re.compile(rb""" ^ # match from start \s* # optional spaces (\d+) # line number \s+ # spaces \((\d+)\) # parent \s+ # spaces ([A-Z][A-Z0-9_]*) # varname \s* # optional spaces , # delimiter \s* # optional spaces (-1|\d+) # instance \s* # optional spaces : # delimiter \s* # optional spaces \"([^\"]*)\" # quoted string (data) \s* # optional spaces ( # bgroup \# # comment leader \s* # optional spaces .* # string (comment) )? # egroup, optional $ # eol """, re.VERBOSE) ############################################################################### # HELPERS ############################################################################### # # Sadly, the Intranator is still stuck with one leg in the 90s. # def to_latin1(s): """Take given unicode str and convert it to a latin1-encoded `bytes`.""" return s.encode("latin-1") def from_latin1(s): """Take given latin1-encoded `bytes` value and convert it to `str`.""" return s.decode("latin-1") # # Conversion functions # def marshal_in_number(number): return int(number) def marshal_in_parent(parent): return int(parent) def marshal_in_instance(instance): return int(instance) def marshal_in_varname(varname): return from_latin1(varname).lower() def marshal_in_data(data): return from_latin1(data).replace(r"\"", "\"") if data is not None else "" def marshal_in_comment(comment): return comment and from_latin1(comment[1:].strip()) or None # # Type checking # def is_string(s): return isinstance(s, str) ############################################################################### # EXCEPTIONS ############################################################################### class InvalidCNF(Exception): def __init__(self, msg): self.msg = msg def __str__(self): return "Malformed CNF_VAR: \"%s\"" % self.msg class MalformedCNF(Exception): def __init__(self, msg): self.msg = msg def __str__(self): return "Malformed CNF file: \"%s\"" % self.msg ############################################################################### # VALIDATION ############################################################################### def is_valid(acc, nested, comment, data, instance, number, parent, varname): if varname is None: raise InvalidCNF("CNF_VAR lacks a name.") elif not is_string(varname): raise InvalidCNF("Varname field of CNF_VAR \"%s\" is not a string." % varname) elif varname == "": raise InvalidCNF("Varname field of CNF_VAR is the empty string.") if comment is not None: if not is_string(comment): raise InvalidCNF("Comment field of CNF_VAR \"%s\" is not a string." % varname) if data is None: raise InvalidCNF("Data field of CNF_VAR \"%s\" is empty." % varname) elif not is_string(data): raise InvalidCNF("Data field of CNF_VAR \"%s\" is not a string." % varname) if instance is None: raise InvalidCNF("Instance field of CNF_VAR \"%s\" is empty." % varname) elif not isinstance(instance, int): raise InvalidCNF("Instance field of CNF_VAR \"%s\" is not an integer." % varname) if number is None: raise InvalidCNF("Number field of CNF_VAR \"%s\" is empty." % varname) elif not isinstance(number, int): raise InvalidCNF("Number field of CNF_VAR \"%s\" is not an integer." % varname) elif number < 1: raise InvalidCNF("Number field of CNF_VAR \"%s\" must be positive, not %d." % (varname, number)) else: other = acc.get(number, None) if other is not None: # already in use raise InvalidCNF("Number field of CNF_VAR \"%s\" already used by variable %s." % (varname, other)) acc[number] = varname if nested is True: if parent is None: raise InvalidCNF("Parent field of nested CNF_VAR \"%s\" is empty." % varname) elif not isinstance(parent, int): raise InvalidCNF("Parent field of CNF_VAR \"%s\" is not an integer." % varname) else: if parent is not None: raise InvalidCNF("Flat CNF_VAR \"%s\" has nonsensical parent field \"%s\"." % (varname, parent)) return acc def is_cnf(root): """ is_cnf -- Predicate testing "CNF_VAR-ness". Folds the :py:func:`is_valid` predicate over the argument which can be either a well-formed CNF dictionary or a list of CNF_VARs. :type root: cnfvar or cnf list :rtype: bool Not that if it returns at all, ``is_cnf()`` returns ``True``. Any non well-formed member of the argument will cause the predicate to bail out with an exception during traversal. """ cnf = cnf_root(root) if cnf is None: raise InvalidCNF(root) return walk_cnf(cnf, False, is_valid, {}) is not None def is_cnf_var(obj): """ Check whether a dictionary is a valid CNF. :param dict obj: dictionary to check :returns: True if the dictionary has all the mandatory fields and no unknown fields, False otherwise :rtype: bool """ assert isinstance (obj, dict) for f in CNF_FIELD_MANDATORY: if obj.get(f, None) is None: return False for f in obj: if f not in CNF_FIELD_KNOWN: return False return True ############################################################################### # DESERIALIZATION ############################################################################### # # CNF reader # # Parsing usually starts from the `read_cnf`, which accepts a string containing # the variables to parse in the same structure as returned by `get_cnf`. # # In the `prepare` function the string is split into lines, and a 3-element # tuple is built. The first (named `current`) and second (named `next`) # elements of this tuple are respectively the first and second non-empty lines # of the input, while the third is a list of the remaining lines. This tuple is # named `state` in the implementation below, and it is passed around during # parsing. The `get` and `peek` functions are used to easily retrieve the # `current` and `next` items from the "state". # # When we "advance" the state, we actually drop the "current" element, # replacing it with the "next", while a new "next" is popped from the list of # remaining lines. Parsing is done this way because we need to look ahead at # the next line -- if it is a child it needs to be appended to the `children` # property of the current line. # # Regular expressions are used to extract important information from the CNF # lines. Finally, once parsing is completed, a dictionary is returned. The dict # has the same structure as the serialized JSON output returned by # `get_cnf -j`. # def read_cnf(data): """ Read cnf data from data bytes. :param data: raw data :type data: str or bytes :return: the parsed cnf data :rtype: {str, {str, str or int}} """ if isinstance(data, str): data = to_latin1(data) state = prepare(data) if state is None: raise InvalidCNF("Empty input string.") cnf = parse_cnf_root(state) if is_cnf(cnf) is False: raise TypeError("Invalid CNF_VAR.") return {"cnf": cnf} def prepare(raw): """ Build 3-element iterable from a CNF string dump. :param raw: string content as returned by `get_cnf` :type raw: bytes :returns: 3-element tuple, where the first two elements are the first two lines of the output and the third is a list containing the rest of the lines in reverse. :rtype: (str * str option * str list) option """ lines = raw.splitlines() lines.reverse() try: first = lines.pop() except IndexError: return None try: second = lines.pop() except IndexError: second = None first = first.strip() if first == b"": return advance((first, second, lines)) return (first, second, lines) def advance(cns): """ Pop the next line from the stream, advancing the tuple. :param cns: a 3-element tuple containing two CNF lines and a list of the remaining lines :type cnd: (str, str, [str]) :returns: a new tuple with a new item popped from the list of lines :rtype cnd: (str, str, [str]) """ current, next, stream = cns if next is None: # reached end of stream return None current = next try: next = stream.pop() next = next.strip() except IndexError: next = None if current == "": # skip blank lines return advance((current, next, stream)) return (current, next, stream) def get(cns): """ Get the current line from the state without advancing it. :param cns: a 3-element tuple containing two CNF lines and a list of the remaining lines :type cnd: (str, str, [str]) :returns: the CNF line stored as `current` :rtype: str """ current, _, _ = cns return current def parse_cnf_root(state): """ Iterate over and parse a list of CNF lines. :param state: a 3-element tuple containing two lines and a list of the remaining lines :type state: (str, str, [str]) :returns: a list of parsed CNF variables :rtype: [dict] The function will parse the first element from the `state` tuple, then read the next line to see if it is a child variable. If it is, it will be appended to the last parsed CNF, otherwise top-level parsing is done normally. """ lines = [] current = get(state) while state: cnf_line = read_base_line(current) if cnf_line is not None: lines.append(cnf_line) state = advance(state) if state is None: # -> nothing left to do break current = get(state) parent = get_parent(current) # peek at next line if parent is not None: # -> recurse into children (state, children, _parent) = parse_cnf_children(state, parent) cnf_line["children"] = children if state is None: break current = get(state) else: state = advance(state) if state is None: break current = get(state) return lines def parse_cnf_children(state, parent): """ Read and parse child CNFs of a given parent until there is none left. :param state: a 3-element tuple containing two lines and a list of the remaining lines :type state: (str, str, [str]) :param parent: id of the parent whose children we are looking for :type parent: int :returns: a 3-element tuple with the current state, a list of children of the given parent and the parent ID :rtype: (tuple, [str], int) The function will recursively parse child lines from the `state` tuple until one of these conditions is satisfied: 1. the input is exhausted 2. the next CNF line 2.1. is a toplevel line 2.2. is a child line whose parent has a lower parent number Conceptually, 2.1 is a very similar to 2.2 but due to the special status of toplevel lines in CNF we need to handle them separately. Note that since nesting of CNF vars is achieved via parent line numbers, lines with different parents could appear out of order. libcnffile will happily parse those and still assign children to the specified parent: :: # set_cnf < parent: # parent is further down in hierarchy -> new level (state, children, new_parent) = \ parse_cnf_children (state, new_parent) cnf_line["children"] = children if state is None: break current = get(state) new_parent = get_parent(current) if new_parent is None: # drop stack return (state, lines, None) if new_parent < parent: # parent is further up in hierarchy -> pop level return (state, lines, new_parent) # new_parent == parent -> continue parsing on same level return (state, lines, parent) def get_parent(line): """ Extract the ID of the parent for a given CNF line. :param str line: CNF line :returns: parent ID or None if no parent is found :rtype: int or None """ match = re.match(grab_parent_pattern, line) if match is None: # -> no parent return None return int(match.groups()[0]) def read_base_line(line): """ Turn one top-level CNF line into a dictionary. :param str line: CNF line :rtype: {str: Any} This performs the necessary decoding on values to obtain proper Python strings from 8-bit encoded CNF data. The function only operates on individual lines. Argument strings that contain data for multiple lines – this includes child lines of the current CNF var! – will trigger a parsing exception. """ if len(line.strip()) == 0: return None # ignore empty lines if line[0] == b"#": return None # ignore comments match = re.match(base_line_pattern, line) if match is None: raise MalformedCNF("Syntax error in line \"\"\"%s\"\"\"" % line) number, varname, instance, data, comment = match.groups() return { "number" : marshal_in_number (number), "varname" : marshal_in_varname (varname), "instance" : marshal_in_instance (instance), "data" : marshal_in_data (data), "comment" : marshal_in_comment (comment), } def read_child_line(line): """ Turn one child CNF line into a dictionary. :param str line: CNF line :rtype: {str: Any} This function only operates on individual lines. If the argument string is syntactically valid but contains input representing multiple CNF vars, a parse error will be thrown. """ if len(line.strip()) == 0: return None # ignore empty lines if line[0] == "#": return None # ignore comments match = re.match(child_line_pattern, line) if match is None: raise MalformedCNF("Syntax error in child line \"\"\"%s\"\"\"" % from_latin1 (line)) number, parent, varname, instance, data, comment = match.groups() return { "number" : marshal_in_number (number), "parent" : marshal_in_parent (parent), "varname" : marshal_in_varname (varname), "instance" : marshal_in_instance (instance), "data" : marshal_in_data (data), "comment" : marshal_in_comment (comment), } ############################################################################### # SERIALIZATION ############################################################################### def cnf_root(root): """ Extract a list of CNFs from a given structure. :param root: list of CNFs or a CNF dictionary :type root: [dict] or dict :raises: :py:class:`TypeError` if no CNFs can be extracted :returns: list with one or more CNF objects :rtype: [dict] Output varies depending on a few conditions: - If `root` is a list, return it right away - If `root` is a dict corresponding to a valid CNF value, return it wrapped in a list - If `root` is a dict with a `cnf` key containg a list (as the JSON returned by `get_cnf -j`), return the value - Otherwise, raise an error """ if isinstance(root, list): return root if not isinstance(root, dict): raise TypeError( "Expected dictionary of CNF_VARs, got %s." % type(root)) if is_cnf_var(root): return [root] cnf = root.get("cnf", None) if not isinstance(cnf, list): raise TypeError("Expected list of CNF_VARs, got %s." % type(cnf)) return cnf ############################################################################### # TRAVERSAL ############################################################################### def walk_cnf(cnf, nested, fun, acc): """ Depth-first traversal of a CNF tree. :type cnf: cnf list :type nested: bool :type fun: 'a -> bool -> (cnf stuff) -> 'a :type acc: 'a :rtype: 'a Executes ``fun`` recursively for each node in the tree. The function receives the accumulator ``acc`` which can be of an arbitrary type as first argument. The second argument is a flag indicating whether the current CNF var is a child (if ``True``) or a parent var. CNF member fields are passed via named optional arguments. """ for var in cnf: acc = fun(acc, nested, comment=var.get("comment", None), data=var.get("data", None), instance=var.get("instance", None), number=var.get("number", None), parent=var.get("parent", None), varname=var.get("varname", None)) children = var.get("children", None) if children is not None: acc = walk_cnf(children, True, fun, acc) return acc