[pyi2ncommon] / src / log_read.py

# The software in this package is distributed under the GNU General
# Public License version 2 (with a special exception described below).
#
# A copy of GNU General Public License (GPL) is included in this distribution,
# in the file COPYING.GPL.
#
# As a special exception, if other files instantiate templates or use macros
# or inline functions from this file, or you compile this file and link it
# with other works to produce a work based on this file, this file
# does not by itself cause the resulting work to be covered
# by the GNU General Public License.
#
# However the source code for this file must still be made available
# in accordance with section (3) of the GNU General Public License.
#
# This exception does not invalidate any other reasons why a work based
# on this file might be covered by the GNU General Public License.
#
# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>

""" Iterative reading of log files

Basic Functionality (class :py:class:`IterativeReader`):
Runs stat in a loop to find out whether file size has changed. Then reads the
new data and forwards that

.. todo:: Want to also use lsof to find out whether file/pipe/socket was closed,
         so can return from read loop

:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
it line-wise as is normal for log files

:py:class:`LogParser` takes those lines and tries to parse them into fields
like date, time, module name, urgency and message.

.. todo:: auto-detect log line layout
"""

import os
import re
from warnings import warn
import os.path
import logging
from .iter_helpers import zip_longest
from .type_helpers import is_str_or_byte, is_file_obj


class LogReadWarning(UserWarning):
    """ warnings issued by classes in this module """
    pass


def true_func(unused_argument_but_that_is_ok):
    """ does nothing, always returns True """
    return True


def check_is_used(some_file_or_handle):
    """ check whether file is being written to

    to be implemented, e.g. using lsof
    """
    raise NotImplementedError()


_create_description_unknown_counter = 0


def create_description(file_obj, file_desc):
    """ create some description for given file-like object / file descriptor

    :param file_obj: file-like object
    :param int file_desc: os-level file descriptor
    :returns: string
    """

    global _create_description_unknown_counter

    try:
        desc = file_obj.name
        if desc:
            return desc
    except AttributeError:
        pass

    if file_desc is not None:
        return 'file{0}'.format(file_desc)
    else:
        _create_description_unknown_counter += 1
        return 'unknown{0}'.format(_create_description_unknown_counter)


#: error message for IterativeReader constructor
_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
           'files --> use with open(file_name)!'


class IterativeReader(object):
    """ reads from a given file

    Uses os.stat(file_obj.fileno()).st_size as measure whether file has changed
    or not; Always reads as much data as possible

    Catches most common exceptions in iteration (not constructor)

    Does not care about closing files, so does not accept file names

    This is the base for class :py:class:`LineReader` that just has to
    implement a different :py:meth:`prepare_result` method
    """

    def __init__(self, sources, descs=None, return_when_done=False):
        """ creates a reader; does some basic checks on args

        :param sources: iterable over sources. Sources can be opened file
                        objects or read-opened os-level file descriptors.
                        Calling code has to ensure they are closed properly, so
                        best use this within a "with open(file_name) as
                        file_handle:"-context. If sources is a single file
                        obj/descriptor, both source and desc will be converted
                        to lists of length 1
        :param descs: can be anything of same length as sources. If sources is
                      a single source, then descs is also converted to a list
                      of length 1. If not given (i.e. None), will use
                      :py:func:`create_description` to guess descriptions
        :param bool return_when_done: ignore file_handle if no-one is writing
                                      to it any more. Return from iterator when
                                      all watched files are done (not
                                      implemented yet)
        :raises: OSError when testing fstat on source
        """
        if not sources:
            raise ValueError('need at least some source!')
        elif is_str_or_byte(sources):
            raise ValueError(_STR_ERR.format(sources))
        elif is_file_obj(sources) or isinstance(sources, int):
            source_input = [sources, ]
            desc_input = [descs, ]
        else:
            source_input = sources  # assume some iterable
            desc_input = descs

        # now divide sources into os-level file descriptors for os.fstat,
        # and file objects for read()
        self.file_objs = []
        self.file_descs = []          # file descriptOR, not descriptION
        for source in source_input:
            if is_file_obj(source):
                self.file_objs.append(source)
                self.file_descs.append(source.fileno())
            elif isinstance(source, int):
                self.file_objs.append(os.fdopen(source))
                self.file_descs.append(source)
            elif is_str_or_byte(source):
                raise ValueError(_STR_ERR.format(source))
            else:
                raise ValueError('source {0} is neither file obj nor file '
                                 'descriptor!')

            # try to fstat the new file descriptor just for testing
            os.fstat(self.file_descs[-1])

        # guess descriptions if not given
        if not desc_input:
            self.descriptions = [create_description(obj, file_desc)
                                 for obj, file_desc
                                 in zip(self.file_objs, self.file_descs)]
        else:
            try:
                if len(desc_input) != len(self.file_objs):
                    raise ValueError('need same number of sources and '
                                     'descriptions!')
            except TypeError:
                pass  # desc_input is generator or so

            self.descriptions = []
            for obj, file_desc, description in \
                    zip_longest(self.file_objs, self.file_descs, desc_input):
                if obj is None:
                    raise ValueError('more descriptions than sources!')
                elif description is None:
                    self.descriptions.append(create_description(obj,
                                                                file_desc))
                else:
                    self.descriptions.append(description)

        self.last_sizes = [0 for _ in self.file_objs]
        self.ignore = [False for _ in self.file_objs]

        if return_when_done:
            self.is_used = check_is_used
        else:
            self.is_used = true_func

        for obj, file_desc, description in \
                zip(self.file_objs, self.file_descs, self.descriptions):
            logging.debug('log_read initialized with file descriptor {0}, '
                          'file obj {1}, description "{2}"'
                          .format(file_desc, obj, description))

    def n_sources(self):
        return len(self.file_objs)

    def n_active_sources(self):
        return len(self.ignore) - sum(self.ignore)

    def __iter__(self):
        while True:
            for idx, (obj, file_desc, description, last_size, do_ignore) in \
                    enumerate(zip(self.file_objs, self.file_descs,
                                  self.descriptions, self.last_sizes,
                                  self.ignore)):
                if do_ignore:
                    continue

                # get new file size
                new_size = os.fstat(file_desc).st_size

                # compare to old size
                if new_size == last_size:
                    if not self.is_used(file_desc):
                        warn('no one is writing to {0} / {1} -- '
                             'stop watching it!'
                             .format(file_desc, description),
                             category=LogReadWarning)
                        self.ignore[idx] = True
                else:
                    if new_size < last_size:  # happened at start of some tests
                        warn('{0} / {1} has become smaller ({2} --> {3})! '
                             .format(obj, description, last_size, new_size)
                             + 'Maybe you are reading from a half-initialized '
                             + 'file?',
                             category=LogReadWarning)
                    try:
                        new_data = obj.read()
                    except OSError as ose:    # includes IOErrors
                        warn('io error reading from {0} / {1}: {2})'
                             .format(obj, description, ose),
                             category=LogReadWarning)
                        new_data = str(ose)
                    except UnicodeDecodeError as ude:
                        warn('unicode error reading from {0} / {1}: {2}'
                             .format(obj, description, ude),
                             category=LogReadWarning)
                        new_data = str(ude)

                    # post-processing
                    to_yield = self.prepare_result(description, new_data, idx)
                    for result in to_yield:
                        yield result

                    # prepare next iteration
                    self.last_sizes[idx] = new_size

    def prepare_result(self, description, data, idx):
        """ from raw new data create some yield-able results

        to be intended for overwriting in sub-classes

        this function is called from __iter__ for each new data that becomes
        available. It has to return some iterable whose entries are yielded
        from iteration over objects of this class.

        It receives the following args:
        - the description of the source
        - the data itself
        - the index of the source
        The result must be an iterable of objects, which are yielded as-is, so
        can have any form

        This base implementation just returns its input in a list, so new data
        is yielded from __iter__ as-is
        """
        return [(description, data), ]


LINE_SPLITTERS = '\n\r'


class LineReader(IterativeReader):
    """ an IterativeReader that returns new data line-wise

    this means buffering partial line data
    """

    def __init__(self, *args, **kwargs):
        """ forwards all args and kwargs to :py:class:`IterativeReader` """
        super(LineReader, self).__init__(*args, **kwargs)
        self.line_buffers = ['' for _ in range(self.n_sources())]

    def prepare_result(self, description, new_data, idx):
        """ take raw new data and split it into lines

        if line is not complete, then buffer it

        returns lines without their newline characters
        """
        all_data = self.line_buffers[idx] + new_data
        self.line_buffers[idx] = ''
        result = []
        should_be_no_new_lines = False
        for line in all_data.splitlines(True):
            if line[-1] in LINE_SPLITTERS:
                result.append((description, line.rstrip(LINE_SPLITTERS)))
            elif should_be_no_new_lines:
                raise ValueError('line splitters are not compatible with'
                                 'str.splitlines!')
            else:
                self.line_buffers[idx] = line
                should_be_no_new_lines = True  # (this should be the last)

        return result


class LogParser(LineReader):
    """ takes lines from LineReader and parses their contents

    requires a pattern for log lines, auto-detection is not implemented yet

    Iteration returns re.match result or -- if matching failed -- the original
    raw line
    """

    def __init__(self, log_file, pattern=None):
        """ create a LogParser

        :param str log_file: name of log file to parse (required!)
        :param pattern: regexp to split log lines; None (default) to return
                        line as they are
        """
        super(LogParser, self).__init__(log_file)

        self.pattern = pattern

    def prepare_result(self, *args):
        # let super class split data into lines
        for _, raw_line in super(LogParser, self).prepare_result(*args):
            result = re.match(self.pattern, raw_line)
            if result:
                yield result
            else:
                yield raw_line
Commit	Line	Data
3237d2a6 CH	1	# The software in this package is distributed under the GNU General
	2	# Public License version 2 (with a special exception described below).
	3	#
	4	# A copy of GNU General Public License (GPL) is included in this distribution,
	5	# in the file COPYING.GPL.
	6	#
	7	# As a special exception, if other files instantiate templates or use macros
	8	# or inline functions from this file, or you compile this file and link it
	9	# with other works to produce a work based on this file, this file
	10	# does not by itself cause the resulting work to be covered
	11	# by the GNU General Public License.
	12	#
	13	# However the source code for this file must still be made available
	14	# in accordance with section (3) of the GNU General Public License.
	15	#
	16	# This exception does not invalidate any other reasons why a work based
	17	# on this file might be covered by the GNU General Public License.
f365f614 CH	18	#
f365f614 CH	19	# Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
3237d2a6	20
e7d49180 CH	21	""" Iterative reading of log files
e7d49180 CH	22
7c362208	23	Basic Functionality (class :py:class:`IterativeReader`):
e7d49180 CH	24	Runs stat in a loop to find out whether file size has changed. Then reads the
	25	new data and forwards that
	26
2713b352	27	.. todo:: Want to also use lsof to find out whether file/pipe/socket was closed,
e7d49180 CH	28	so can return from read loop
	29
	30	:py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
	31	it line-wise as is normal for log files
	32
	33	:py:class:`LogParser` takes those lines and tries to parse them into fields
	34	like date, time, module name, urgency and message.
	35
2713b352	36	.. todo:: auto-detect log line layout
e7d49180 CH	37	"""
	38
	39	import os
445afb23	40	import re
e7d49180 CH	41	from warnings import warn
e7d49180 CH	42	import os.path
77a07d09	43	import logging
1242b1cf	44	from .iter_helpers import zip_longest
1242b1cf	45	from .type_helpers import is_str_or_byte, is_file_obj
e7d49180 CH	46
	47
	48	class LogReadWarning(UserWarning):
	49	""" warnings issued by classes in this module """
	50	pass
	51
	52
	53	def true_func(unused_argument_but_that_is_ok):
	54	""" does nothing, always returns True """
	55	return True
	56
	57
	58	def check_is_used(some_file_or_handle):
	59	""" check whether file is being written to
	60
	61	to be implemented, e.g. using lsof
	62	"""
	63	raise NotImplementedError()
	64
	65
	66	_create_description_unknown_counter = 0
	67
acc472a4	68
e7d49180 CH	69	def create_description(file_obj, file_desc):
	70	""" create some description for given file-like object / file descriptor
	71
	72	:param file_obj: file-like object
	73	:param int file_desc: os-level file descriptor
	74	:returns: string
	75	"""
	76
	77	global _create_description_unknown_counter
	78
	79	try:
	80	desc = file_obj.name
	81	if desc:
	82	return desc
	83	except AttributeError:
	84	pass
	85
	86	if file_desc is not None:
	87	return 'file{0}'.format(file_desc)
	88	else:
	89	_create_description_unknown_counter += 1
	90	return 'unknown{0}'.format(_create_description_unknown_counter)
	91
	92
	93	#: error message for IterativeReader constructor
	94	_STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
	95	'files --> use with open(file_name)!'
	96
	97
879f0150	98	class IterativeReader(object):
e7d49180 CH	99	""" reads from a given file
	100
	101	Uses os.stat(file_obj.fileno()).st_size as measure whether file has changed
	102	or not; Always reads as much data as possible
	103
	104	Catches most common exceptions in iteration (not constructor)
	105
7c362208	106	Does not care about closing files, so does not accept file names
e7d49180 CH	107
	108	This is the base for class :py:class:`LineReader` that just has to
	109	implement a different :py:meth:`prepare_result` method
	110	"""
	111
	112	def __init__(self, sources, descs=None, return_when_done=False):
	113	""" creates a reader; does some basic checks on args
	114
	115	:param sources: iterable over sources. Sources can be opened file
	116	objects or read-opened os-level file descriptors.
	117	Calling code has to ensure they are closed properly, so
	118	best use this within a "with open(file_name) as
	119	file_handle:"-context. If sources is a single file
	120	obj/descriptor, both source and desc will be converted
	121	to lists of length 1
	122	:param descs: can be anything of same length as sources. If sources is
	123	a single source, then descs is also converted to a list
	124	of length 1. If not given (i.e. None), will use
	125	:py:func:`create_description` to guess descriptions
	126	:param bool return_when_done: ignore file_handle if no-one is writing
	127	to it any more. Return from iterator when
	128	all watched files are done (not
	129	implemented yet)
	130	:raises: OSError when testing fstat on source
	131	"""
	132	if not sources:
	133	raise ValueError('need at least some source!')
	134	elif is_str_or_byte(sources):
	135	raise ValueError(_STR_ERR.format(sources))
	136	elif is_file_obj(sources) or isinstance(sources, int):
	137	source_input = [sources, ]
	138	desc_input = [descs, ]
	139	else:
	140	source_input = sources # assume some iterable
	141	desc_input = descs
	142
	143	# now divide sources into os-level file descriptors for os.fstat,
	144	# and file objects for read()
	145	self.file_objs = []
	146	self.file_descs = [] # file descriptOR, not descriptION
	147	for source in source_input:
	148	if is_file_obj(source):
	149	self.file_objs.append(source)
	150	self.file_descs.append(source.fileno())
	151	elif isinstance(source, int):
	152	self.file_objs.append(os.fdopen(source))
	153	self.file_descs.append(source)
	154	elif is_str_or_byte(source):
	155	raise ValueError(_STR_ERR.format(source))
	156	else:
	157	raise ValueError('source {0} is neither file obj nor file '
	158	'descriptor!')
	159
d910eba5	160	# try to fstat the new file descriptor just for testing
e7d49180 CH	161	os.fstat(self.file_descs[-1])
	162
	163	# guess descriptions if not given
	164	if not desc_input:
	165	self.descriptions = [create_description(obj, file_desc)
	166	for obj, file_desc
	167	in zip(self.file_objs, self.file_descs)]
	168	else:
	169	try:
	170	if len(desc_input) != len(self.file_objs):
	171	raise ValueError('need same number of sources and '
	172	'descriptions!')
	173	except TypeError:
	174	pass # desc_input is generator or so
	175
	176	self.descriptions = []
	177	for obj, file_desc, description in \
	178	zip_longest(self.file_objs, self.file_descs, desc_input):
	179	if obj is None:
	180	raise ValueError('more descriptions than sources!')
	181	elif description is None:
	182	self.descriptions.append(create_description(obj,
	183	file_desc))
	184	else:
	185	self.descriptions.append(description)
	186
	187	self.last_sizes = [0 for _ in self.file_objs]
	188	self.ignore = [False for _ in self.file_objs]
	189
	190	if return_when_done:
	191	self.is_used = check_is_used
	192	else:
	193	self.is_used = true_func
	194
77a07d09 CH	195	for obj, file_desc, description in \
	196	zip(self.file_objs, self.file_descs, self.descriptions):
	197	logging.debug('log_read initialized with file descriptor {0}, '
	198	'file obj {1}, description "{2}"'
	199	.format(file_desc, obj, description))
e7d49180 CH	200
	201	def n_sources(self):
	202	return len(self.file_objs)
	203
	204	def n_active_sources(self):
	205	return len(self.ignore) - sum(self.ignore)
	206
	207	def __iter__(self):
	208	while True:
	209	for idx, (obj, file_desc, description, last_size, do_ignore) in \
	210	enumerate(zip(self.file_objs, self.file_descs,
	211	self.descriptions, self.last_sizes,
	212	self.ignore)):
e7d49180 CH	213	if do_ignore:
	214	continue
	215
	216	# get new file size
	217	new_size = os.fstat(file_desc).st_size
	218
	219	# compare to old size
	220	if new_size == last_size:
	221	if not self.is_used(file_desc):
	222	warn('no one is writing to {0} / {1} -- '
	223	'stop watching it!'
	224	.format(file_desc, description),
	225	category=LogReadWarning)
acc472a4	226	self.ignore[idx] = True
9f2fbfa7	227	else:
ea8b01a3 CH	228	if new_size < last_size: # happened at start of some tests
	229	warn('{0} / {1} has become smaller ({2} --> {3})! '
	230	.format(obj, description, last_size, new_size)
	231	+ 'Maybe you are reading from a half-initialized '
	232	+ 'file?',
9f2fbfa7	233	category=LogReadWarning)
e7d49180 CH	234	try:
	235	new_data = obj.read()
	236	except OSError as ose: # includes IOErrors
	237	warn('io error reading from {0} / {1}: {2})'
	238	.format(obj, description, ose),
	239	category=LogReadWarning)
9451b2ce CH	240	new_data = str(ose)
	241	except UnicodeDecodeError as ude:
	242	warn('unicode error reading from {0} / {1}: {2}'
	243	.format(obj, description, ude),
	244	category=LogReadWarning)
	245	new_data = str(ude)
e7d49180 CH	246
	247	# post-processing
	248	to_yield = self.prepare_result(description, new_data, idx)
	249	for result in to_yield:
	250	yield result
	251
	252	# prepare next iteration
	253	self.last_sizes[idx] = new_size
	254
	255	def prepare_result(self, description, data, idx):
	256	""" from raw new data create some yield-able results
	257
	258	to be intended for overwriting in sub-classes
	259
	260	this function is called from __iter__ for each new data that becomes
	261	available. It has to return some iterable whose entries are yielded
	262	from iteration over objects of this class.
	263
7c362208 CH	264	It receives the following args:
	265	- the description of the source
	266	- the data itself
	267	- the index of the source
	268	The result must be an iterable of objects, which are yielded as-is, so
	269	can have any form
	270
e7d49180 CH	271	This base implementation just returns its input in a list, so new data
	272	is yielded from __iter__ as-is
	273	"""
	274	return [(description, data), ]
	275
	276
	277	LINE_SPLITTERS = '\n\r'
	278
acc472a4	279
e7d49180 CH	280	class LineReader(IterativeReader):
e7d49180 CH	281	""" an IterativeReader that returns new data line-wise
edd68a74	282
e7d49180 CH	283	this means buffering partial line data
	284	"""
	285
	286	def __init__(self, args, *kwargs):
	287	""" forwards all args and kwargs to :py:class:`IterativeReader` """
01fe1580	288	super(LineReader, self).__init__(args, *kwargs)
d910eba5	289	self.line_buffers = ['' for _ in range(self.n_sources())]
e7d49180	290
d910eba5	291	def prepare_result(self, description, new_data, idx):
e7d49180 CH	292	""" take raw new data and split it into lines
	293
	294	if line is not complete, then buffer it
	295
	296	returns lines without their newline characters
	297	"""
e7d49180	298	all_data = self.line_buffers[idx] + new_data
d910eba5	299	self.line_buffers[idx] = ''
e7d49180 CH	300	result = []
e7d49180 CH	301	should_be_no_new_lines = False
879f0150	302	for line in all_data.splitlines(True):
e7d49180 CH	303	if line[-1] in LINE_SPLITTERS:
	304	result.append((description, line.rstrip(LINE_SPLITTERS)))
	305	elif should_be_no_new_lines:
	306	raise ValueError('line splitters are not compatible with'
	307	'str.splitlines!')
	308	else:
	309	self.line_buffers[idx] = line
	310	should_be_no_new_lines = True # (this should be the last)
	311
	312	return result
	313
	314
445afb23 CH	315	class LogParser(LineReader):
	316	""" takes lines from LineReader and parses their contents
	317
	318	requires a pattern for log lines, auto-detection is not implemented yet
	319
	320	Iteration returns re.match result or -- if matching failed -- the original
	321	raw line
	322	"""
	323
	324	def __init__(self, log_file, pattern=None):
	325	""" create a LogParser
	326
	327	:param str log_file: name of log file to parse (required!)
	328	:param pattern: regexp to split log lines; None (default) to return
	329	line as they are
	330	"""
	331	super(LogParser, self).__init__(log_file)
	332
	333	self.pattern = pattern
	334
	335	def prepare_result(self, *args):
	336	# let super class split data into lines
	337	for _, raw_line in super(LogParser, self).prepare_result(*args):
	338	result = re.match(self.pattern, raw_line)
	339	if result:
	340	yield result
	341	else:
	342	yield raw_line