Remove api doc headers
[pyi2ncommon] / src / cnfvar / string.py
CommitLineData
b0075fa2
PD
1#!/usr/bin/env python
2#
3# The software in this package is distributed under the GNU General
4# Public License version 2 (with a special exception described below).
5#
6# A copy of GNU General Public License (GPL) is included in this distribution,
7# in the file COPYING.GPL.
8#
9# As a special exception, if other files instantiate templates or use macros
10# or inline functions from this file, or you compile this file and link it
11# with other works to produce a work based on this file, this file
12# does not by itself cause the resulting work to be covered
13# by the GNU General Public License.
14#
15# However the source code for this file must still be made available
16# in accordance with section (3) of the GNU General Public License.
17#
18# This exception does not invalidate any other reasons why a work based
19# on this file might be covered by the GNU General Public License.
20#
21# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
22
23"""
fcec8a63 24Functionality for parsing cnfvars from strings.
b0075fa2
PD
25
26.. codeauthor:: Intra2net
27
b0075fa2
PD
28This module provides read and parse functionality for the Intra2net *CNF*
29format from strings and by extension cnf files.
30
31The input string takes one round-trip through the parsers and will error out on
32problematic lines. Thus, this module can also be used to syntax-check CNF data.
33
34Note that line numbers may be arbitrarily reassigned in the process. Of course,
35parent references and the relative ordering of lines will be preserved in this
36case.
37
38.. todo::
39 Decide on some facility for automatic fixup of line number values. The
40 internal representation is recursive so line numbers are not needed to
41 establish a variable hierarchy. They might as well be omitted from the json
42 input and only added when writing cnf. For the time being a lack of the
43 "number" field is interpreted as an error. Though it should at least
44 optionally be possible to omit the numbers entirely and have a function add
45 them after the fact. This might as well be added to :py:func:`is_cnf`
46 though it would be counterintuitive to have a predicate mutate its
47 argument. So maybe it could return a second argument to indicate a valid
48 structure that needs fixup or something like that.
49
50.. note::
fcec8a63
CH
51 Since the original *CNF* format assumes latin-1 encoded data pretty much
52 exclusively, we preserve the original encoding while parsing the file.
53 When assembling the data structures returned to the user, values are then
54 converted to strings so they can be used naturally at the Python end.
b0075fa2
PD
55"""
56
57import functools
58import sys
59import json
60import re
61import io
62
63
64###############################################################################
65# CONSTANTS
66###############################################################################
67
68
69CNF_FIELD_MANDATORY = set ([ "varname", "data", "instance" ])
70CNF_FIELD_OPTIONAL = set ([ "parent", "children", "comment", "number" ])
71CNF_FIELD_KNOWN = CNF_FIELD_MANDATORY | CNF_FIELD_OPTIONAL
72
73grab_parent_pattern = re.compile(b"""
74 ^ # match from start
75 \s* # optional spaces
76 \d+ # line number
77 \s+ # spaces
78 \((\d+)\) # parent
79 """,
80 re.VERBOSE)
81
82base_line_pattern = re.compile(b"""
83 ^ # match from start
84 \s* # optional spaces
85 (\d+) # line number
86 \s+ # spaces
87 ([A-Z][A-Z0-9_]*) # varname
88 \s* # optional spaces
89 , # delimiter
90 \s* # optional spaces
91 (-1|\d+) # instance
92 \s* # optional spaces
93 : # delimiter
94 \s* # optional spaces
95 \"( # quoted string (data)
96 (?: \\\" # (of escaped dquote
97 |[^\"])* # or anything not a
98 )\" # literal quote)
99 \s* # optional spaces
100 ( # bgroup
101 \# # comment leader
102 \s* # optional spaces
103 .* # string (comment)
104 )? # egroup, optional
105 $ # eol
106 """,
107 re.VERBOSE)
108
109child_line_pattern = re.compile(b"""
110 ^ # match from start
111 \s* # optional spaces
112 (\d+) # line number
113 \s+ # spaces
114 \((\d+)\) # parent
115 \s+ # spaces
116 ([A-Z][A-Z0-9_]*) # varname
117 \s* # optional spaces
118 , # delimiter
119 \s* # optional spaces
120 (-1|\d+) # instance
121 \s* # optional spaces
122 : # delimiter
123 \s* # optional spaces
124 \"([^\"]*)\" # quoted string (data)
125 \s* # optional spaces
126 ( # bgroup
127 \# # comment leader
128 \s* # optional spaces
129 .* # string (comment)
130 )? # egroup, optional
131 $ # eol
132 """,
133 re.VERBOSE)
134
135
136###############################################################################
137# HELPERS
138###############################################################################
139
140
141#
142# Sadly, the Intranator is still stuck with one leg in the 90s.
143#
144def to_latin1(s):
145 """Take given unicode str and convert it to a latin1-encoded `bytes`."""
146 return s.encode("latin-1")
147
148
149def from_latin1(s):
150 """Take given latin1-encoded `bytes` value and convert it to `str`."""
151 return s.decode("latin-1")
152
153
154#
155# Conversion functions
156#
157
158def marshal_in_number(number):
159 return int(number)
160
161
162def marshal_in_parent(parent):
163 return int(parent)
164
165
166def marshal_in_instance(instance):
167 return int(instance)
168
169
170def marshal_in_varname(varname):
171 return from_latin1(varname).lower()
172
173
174def marshal_in_data(data):
175 return from_latin1(data) if data is not None else ""
176
177
178def marshal_in_comment(comment):
179 return comment and from_latin1(comment[1:].strip()) or None
180
181
182#
183# Type checking
184#
185
186def is_string(s):
187 return isinstance(s, str)
188
189
190###############################################################################
191# EXCEPTIONS
192###############################################################################
193
194
195class InvalidCNF(Exception):
196
197 def __init__(self, msg):
198 self.msg = msg
199
200 def __str__(self):
201 return "Malformed CNF_VAR: \"%s\"" % self.msg
202
203
204class MalformedCNF(Exception):
205
206 def __init__(self, msg):
207 self.msg = msg
208
209 def __str__(self):
210 return "Malformed CNF file: \"%s\"" % self.msg
211
212
213###############################################################################
214# VALIDATION
215###############################################################################
216
217
218def is_valid(acc,
219 nested,
220 comment,
221 data,
222 instance,
223 number,
224 parent,
225 varname):
226 if varname is None:
227 raise InvalidCNF("CNF_VAR lacks a name.")
228 elif not is_string(varname):
229 raise InvalidCNF("Varname field of CNF_VAR \"%s\" is not a string."
230 % varname)
231 elif varname == "":
232 raise InvalidCNF("Varname field of CNF_VAR is the empty string.")
233
234 if comment is not None:
235 if not is_string(comment):
236 raise InvalidCNF("Comment field of CNF_VAR \"%s\" is not a string."
237 % varname)
238
239 if data is None:
240 raise InvalidCNF("Data field of CNF_VAR \"%s\" is empty."
241 % varname)
242 elif not is_string(data):
243 raise InvalidCNF("Data field of CNF_VAR \"%s\" is not a string."
244 % varname)
245
246 if instance is None:
247 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is empty."
248 % varname)
249 elif not isinstance(instance, int):
250 raise InvalidCNF("Instance field of CNF_VAR \"%s\" is not an integer."
251 % varname)
252
253 if number is None:
254 raise InvalidCNF("Number field of CNF_VAR \"%s\" is empty."
255 % varname)
256 elif not isinstance(number, int):
257 raise InvalidCNF("Number field of CNF_VAR \"%s\" is not an integer."
258 % varname)
259 elif number < 1:
260 raise InvalidCNF("Number field of CNF_VAR \"%s\" must be positive, not %d."
261 % (varname, number))
262 else:
263 other = acc.get(number, None)
264 if other is not None: # already in use
265 raise InvalidCNF("Number field of CNF_VAR \"%s\" already used by variable %s."
266 % (varname, other))
267 acc[number] = varname
268
269 if nested is True:
270 if parent is None:
271 raise InvalidCNF("Parent field of nested CNF_VAR \"%s\" is empty."
272 % varname)
273 elif not isinstance(parent, int):
274 raise InvalidCNF("Parent field of CNF_VAR \"%s\" is not an integer."
275 % varname)
276 else:
277 if parent is not None:
278 raise InvalidCNF("Flat CNF_VAR \"%s\" has nonsensical parent field \"%s\"."
279 % (varname, parent))
280 return acc
281
282
283def is_cnf(root):
284 """
285 is_cnf -- Predicate testing "CNF_VAR-ness". Folds the :py:func:`is_valid`
286 predicate over the argument which can be either a well-formed CNF
287 dictionary or a list of CNF_VARs.
288
289 :type root: cnfvar or cnf list
290 :rtype: bool
291
292 Not that if it returns at all, ``is_cnf()`` returns ``True``. Any non
293 well-formed member of the argument will cause the predicate to bail out
294 with an exception during traversal.
295 """
296 cnf = cnf_root(root)
297 if cnf is None:
298 raise InvalidCNF(root)
299 return walk_cnf(cnf, False, is_valid, {}) is not None
300
301
302def is_cnf_var(obj):
303 """
304 Check whether a dictionary is a valid CNF.
305
306 :param dict obj: dictionary to check
307 :returns: True if the dictionary has all the mandatory fields and no
308 unknown fields, False otherwise
309 :rtype: bool
310 """
311 assert isinstance (obj, dict)
312
313 for f in CNF_FIELD_MANDATORY:
314 if obj.get(f, None) is None:
315 return False
316
317 for f in obj:
318 if f not in CNF_FIELD_KNOWN:
319 return False
320
321 return True
322
323
324###############################################################################
325# DESERIALIZATION
326###############################################################################
327#
328# CNF reader
329#
330# Parsing usually starts from the `read_cnf`, which accepts a string containing
331# the variables to parse in the same structure as returned by `get_cnf`.
332#
333# In the `prepare` function the string is split into lines, and a 3-element
334# tuple is built. The first (named `current`) and second (named `next`)
335# elements of this tuple are respectively the first and second non-empty lines
336# of the input, while the third is a list of the remaining lines. This tuple is
337# named `state` in the implementation below, and it is passed around during
338# parsing. The `get` and `peek` functions are used to easily retrieve the
339# `current` and `next` items from the "state".
340#
341# When we "advance" the state, we actually drop the "current" element,
342# replacing it with the "next", while a new "next" is popped from the list of
343# remaining lines. Parsing is done this way because we need to look ahead at
344# the next line -- if it is a child it needs to be appended to the `children`
345# property of the current line.
346#
347# Regular expressions are used to extract important information from the CNF
348# lines. Finally, once parsing is completed, a dictionary is returned. The dict
349# has the same structure as the serialized JSON output returned by
350# `get_cnf -j`.
351#
352
353
354def read_cnf(data):
355 """
356 Read cnf data from data bytes.
357
358 :param data: raw data
359 :type data: str or bytes
360 :return: the parsed cnf data
361 :rtype: {str, {str, str or int}}
362 """
363 if isinstance(data, str):
364 data = to_latin1(data)
365 state = prepare(data)
366 if state is None:
367 raise InvalidCNF("Empty input string.")
368
369 cnf = parse_cnf_root(state)
370 if is_cnf(cnf) is False:
371 raise TypeError("Invalid CNF_VAR.")
372 return {"cnf": cnf}
373
374
375def prepare(raw):
376 """
377 Build 3-element iterable from a CNF string dump.
378
379 :param raw: string content as returned by `get_cnf`
380 :type raw: bytes
381 :returns: 3-element tuple, where the first two elements are the first two
382 lines of the output and the third is a list containing the rest
383 of the lines in reverse.
384 :rtype: (str * str option * str list) option
385 """
386 lines = raw.splitlines()
387 lines.reverse()
388 try:
389 first = lines.pop()
390 except IndexError:
391 return None
392
393 try:
394 second = lines.pop()
395 except IndexError:
396 second = None
397
398 first = first.strip()
399 if first == b"":
400 return advance((first, second, lines))
401
402 return (first, second, lines)
403
404
405def advance(cns):
406 """
407 Pop the next line from the stream, advancing the tuple.
408
409 :param cns: a 3-element tuple containing two CNF lines and a list of the
410 remaining lines
411 :type cnd: (str, str, [str])
412 :returns: a new tuple with a new item popped from the list of lines
413 :rtype cnd: (str, str, [str])
414 """
415 current, next, stream = cns
416 if next is None: # reached end of stream
417 return None
418 current = next
419
420 try:
421 next = stream.pop()
422 next = next.strip()
423 except IndexError:
424 next = None
425
426 if current == "": # skip blank lines
427 return advance((current, next, stream))
428 return (current, next, stream)
429
430
431def get(cns):
432 """
433 Get the current line from the state without advancing it.
434
435 :param cns: a 3-element tuple containing two CNF lines and a list of the
436 remaining lines
437 :type cnd: (str, str, [str])
438 :returns: the CNF line stored as `current`
439 :rtype: str
440 """
441 current, _, _ = cns
442 return current
443
444
445def parse_cnf_root(state):
446 """
447 Iterate over and parse a list of CNF lines.
448
449 :param state: a 3-element tuple containing two lines and a list of the
450 remaining lines
451 :type state: (str, str, [str])
452 :returns: a list of parsed CNF variables
453 :rtype: [dict]
454
455 The function will parse the first element from the `state` tuple, then read
456 the next line to see if it is a child variable. If it is, it will be
457 appended to the last parsed CNF, otherwise top-level parsing is done
458 normally.
459 """
460 lines = []
461 current = get(state)
462 while state:
463 cnf_line = read_base_line(current)
464 if cnf_line is not None:
465 lines.append(cnf_line)
466 state = advance(state)
467 if state is None: # -> nothing left to do
468 break
469 current = get(state)
470 parent = get_parent(current) # peek at next line
471 if parent is not None: # -> recurse into children
472 (state, children, _parent) = parse_cnf_children(state, parent)
473 cnf_line["children"] = children
474 if state is None:
475 break
476 current = get(state)
477 else:
478 state = advance(state)
479 if state is None:
480 break
481 current = get(state)
482 return lines
483
484
485def parse_cnf_children(state, parent):
486 """
487 Read and parse child CNFs of a given parent until there is none left.
488
489 :param state: a 3-element tuple containing two lines and a list of the
490 remaining lines
491 :type state: (str, str, [str])
492 :param parent: id of the parent whose children we are looking for
493 :type parent: int
494 :returns: a 3-element tuple with the current state, a list of children of
495 the given parent and the parent ID
496 :rtype: (tuple, [str], int)
497
498 The function will recursively parse child lines from the `state` tuple
499 until one of these conditions is satisfied:
500
501 1. the input is exhausted
502 2. the next CNF line
503 2.1. is a toplevel line
504 2.2. is a child line whose parent has a lower parent number
505
506 Conceptually, 2.1 is a very similar to 2.2 but due to the special status of
507 toplevel lines in CNF we need to handle them separately.
508
509 Note that since nesting of CNF vars is achieved via parent line numbers,
510 lines with different parents could appear out of order. libcnffile will
511 happily parse those and still assign children to the specified parent:
512
513 ::
514 # set_cnf <<THATSALL
515 1 USER,1337: "l33t_h4x0r"
516 2 (1) USER_GROUP_MEMBER_REF,0: "2"
517 4 USER,1701: "picard"
518 5 (4) USER_GROUP_MEMBER_REF,0: "2"
519 6 (4) USER_PASSWORD,0: "engage"
520 3 (1) USER_PASSWORD,0: "hacktheplanet"
521 THATSALL
522 # get_cnf user 1337
523 1 USER,1337: "l33t_h4x0r"
524 2 (1) USER_GROUP_MEMBER_REF,0: "2"
525 3 (1) USER_PASSWORD,0: "hacktheplanet"
526 # get_cnf user 1701
527 1 USER,1701: "picard"
528 2 (1) USER_GROUP_MEMBER_REF,0: "2"
529 3 (1) USER_PASSWORD,0: "engage"
530
531 It is a limitation of ``cnfvar.py`` that it cannot parse CNF data
532 structured like the above example: child lists are only populated from
533 subsequent CNF vars using the parent number solely to track nesting levels.
534 The parser does not keep track of line numbers while traversing the input
535 so it doesn’t support retroactively assigning a child to anything else but
536 the immediate parent.
537 """
538 lines = []
539 current = get(state)
540 while True:
541 cnf_line = read_child_line(current)
542 if cnf_line is not None:
543 lines.append(cnf_line)
544 state = advance(state)
545 if state is None:
546 break
547 current = get(state)
548 new_parent = get_parent(current)
549 if new_parent is None:
550 # drop stack
551 return (state, lines, None)
552 if new_parent > parent:
553 # parent is further down in hierarchy -> new level
554 (state, children, new_parent) = \
555 parse_cnf_children (state, new_parent)
556 if state is None:
557 break
558 cnf_line["children"] = children
559 current = get(state)
560 new_parent = get_parent(current)
561 if new_parent is None:
562 # drop stack
563 return (state, lines, None)
564 if new_parent < parent:
565 # parent is further up in hierarchy -> pop level
566 return (state, lines, new_parent)
567 # new_parent == parent -> continue parsing on same level
568 return (state, lines, parent)
569
570
571def get_parent(line):
572 """
573 Extract the ID of the parent for a given CNF line.
574
575 :param str line: CNF line
576 :returns: parent ID or None if no parent is found
577 :rtype: int or None
578 """
579 match = re.match(grab_parent_pattern, line)
580 if match is None: # -> no parent
581 return None
582 return int(match.groups()[0])
583
584
585def read_base_line(line):
586 """
587 Turn one top-level CNF line into a dictionary.
588
589 :param str line: CNF line
590 :rtype: {str: Any}
591
592 This performs the necessary decoding on values to obtain proper Python
593 strings from 8-bit encoded CNF data.
594
595 The function only operates on individual lines. Argument strings that
596 contain data for multiple lines – this includes child lines of the current
597 CNF var! – will trigger a parsing exception.
598 """
599 if len(line.strip()) == 0:
600 return None # ignore empty lines
601 if line[0] == b"#":
602 return None # ignore comments
603
604 match = re.match(base_line_pattern, line)
605 if match is None:
606 raise MalformedCNF("Syntax error in line \"\"\"%s\"\"\"" % line)
607 number, varname, instance, data, comment = match.groups()
608 return {
609 "number" : marshal_in_number (number),
610 "varname" : marshal_in_varname (varname),
611 "instance" : marshal_in_instance (instance),
612 "data" : marshal_in_data (data),
613 "comment" : marshal_in_comment (comment),
614 }
615
616
617def read_child_line(line):
618 """
619 Turn one child CNF line into a dictionary.
620
621 :param str line: CNF line
622 :rtype: {str: Any}
623
624 This function only operates on individual lines. If the argument string is
625 syntactically valid but contains input representing multiple CNF vars, a
626 parse error will be thrown.
627 """
628 if len(line.strip()) == 0:
629 return None # ignore empty lines
630 if line[0] == "#":
631 return None # ignore comments
632
633 match = re.match(child_line_pattern, line)
634 if match is None:
635 raise MalformedCNF("Syntax error in child line \"\"\"%s\"\"\""
636 % from_latin1 (line))
637 number, parent, varname, instance, data, comment = match.groups()
638 return {
639 "number" : marshal_in_number (number),
640 "parent" : marshal_in_parent (parent),
641 "varname" : marshal_in_varname (varname),
642 "instance" : marshal_in_instance (instance),
643 "data" : marshal_in_data (data),
644 "comment" : marshal_in_comment (comment),
645 }
646
647
648###############################################################################
649# SERIALIZATION
650###############################################################################
651
652
653def cnf_root(root):
654 """
655 Extract a list of CNFs from a given structure.
656
657 :param root: list of CNFs or a CNF dictionary
658 :type root: [dict] or dict
659 :raises: :py:class:`TypeError` if no CNFs can be extracted
660 :returns: list with one or more CNF objects
661 :rtype: [dict]
662
663 Output varies depending on a few conditions:
df036fbe
CH
664 - If `root` is a list, return it right away
665 - If `root` is a dict corresponding to a valid CNF value, return it wrapped
666 in a list
667 - If `root` is a dict with a `cnf` key containg a list (as the JSON
668 returned by `get_cnf -j`), return the value
669 - Otherwise, raise an error
b0075fa2
PD
670 """
671 if isinstance(root, list):
672 return root
673 if not isinstance(root, dict):
674 raise TypeError(
675 "Expected dictionary of CNF_VARs, got %s." % type(root))
676 if is_cnf_var(root):
677 return [root]
678 cnf = root.get("cnf", None)
679 if not isinstance(cnf, list):
680 raise TypeError("Expected list of CNF_VARs, got %s." % type(cnf))
681 return cnf
682
683
684###############################################################################
685# TRAVERSAL
686###############################################################################
687
688
689def walk_cnf(cnf, nested, fun, acc):
690 """
691 Depth-first traversal of a CNF tree.
692
693 :type cnf: cnf list
694 :type nested: bool
695 :type fun: 'a -> bool -> (cnf stuff) -> 'a
696 :type acc: 'a
697 :rtype: 'a
698
699 Executes ``fun`` recursively for each node in the tree. The function
700 receives the accumulator ``acc`` which can be of an arbitrary type as first
701 argument. The second argument is a flag indicating whether the current
702 CNF var is a child (if ``True``) or a parent var. CNF member fields are
703 passed via named optional arguments.
704 """
705 for var in cnf:
706 acc = fun(acc,
707 nested,
708 comment=var.get("comment", None),
709 data=var.get("data", None),
710 instance=var.get("instance", None),
711 number=var.get("number", None),
712 parent=var.get("parent", None),
713 varname=var.get("varname", None))
714 children = var.get("children", None)
715 if children is not None:
716 acc = walk_cnf(children, True, fun, acc)
717 return acc