developer.intra2net.com Git - pyi2ncommon/blob - src/log_read.py

   1 # The software in this package is distributed under the GNU General
   2 # Public License version 2 (with a special exception described below).
   3 #
   4 # A copy of GNU General Public License (GPL) is included in this distribution,
   5 # in the file COPYING.GPL.
   6 #
   7 # As a special exception, if other files instantiate templates or use macros
   8 # or inline functions from this file, or you compile this file and link it
   9 # with other works to produce a work based on this file, this file
  10 # does not by itself cause the resulting work to be covered
  11 # by the GNU General Public License.
  12 #
  13 # However the source code for this file must still be made available
  14 # in accordance with section (3) of the GNU General Public License.
  15 #
  16 # This exception does not invalidate any other reasons why a work based
  17 # on this file might be covered by the GNU General Public License.
  18 #
  19 # Copyright (c) 2016-2018 Intra2net AG <info@intra2net.com>
  20
  21 """
  22
  23 SUMMARY
  24 ------------------------------------------------------
  25 Iterative reading of log files, similar to shell command `tail -f`.
  26
  27 Copyright: Intra2net AG
  28
  29
  30 CONTENTS
  31 ------------------------------------------------------
  32
  33 Basic Functionality (class :py:class:`IterativeReader`):
  34 Runs stat in a loop to find out whether file size has changed. Then reads the
  35 new data and forwards it.
  36
  37 .. todo:: Want to also use lsof to find out whether file/pipe/socket was
  38           closed, so can return from read loop
  39
  40 :py:class:`LineReader` takes output of :py:class:`IterativeReader` and returns
  41 it line-wise as is normal for log files
  42
  43 :py:class:`LogParser` takes those lines and tries to parse them into fields
  44 like date, time, module name, urgency and message.
  45
  46 .. todo:: auto-detect log line layout
  47
  48
  49 INTERFACE
  50 ------------------------------------------------------
  51
  52 """
  53
  54 import os
  55 import os.path
  56 import re
  57 from warnings import warn
  58 import logging
  59 from .iter_helpers import zip_longest
  60 from .type_helpers import is_str_or_byte, is_file_obj
  61
  62
  63 class LogReadWarning(UserWarning):
  64     """Warnings issued by classes in this module."""
  65     pass
  66
  67
  68 def true_func(_):
  69     """Replacement for :py:func:`check_is_used`. Returns `True` always."""
  70     return True
  71
  72
  73 def check_is_used(file_handle):
  74     """
  75     Check whether file is being written to.
  76
  77     To be implemented, e.g. using lsof.
  78
  79     If beneficial could also easily supply python file object as arg.
  80
  81     :param int file_handle: OS-level file descriptor
  82     """
  83     raise NotImplementedError(file_handle)
  84
  85
  86 #: counter for unknown sources in :py:func:`create_description`
  87 _create_description_unknown_counter = 0
  88
  89
  90 def create_description(file_obj, file_handle):
  91     """
  92     Create some description for given file-like object / file descriptor.
  93
  94     :param file_obj: file-like object
  95     :param int file_handle: os-level file descriptor
  96     :returns: Short description for file-like object
  97     :rtype: string
  98     """
  99     global _create_description_unknown_counter
 100
 101     try:
 102         desc = file_obj.name
 103         if desc:
 104             return desc
 105     except AttributeError:
 106         pass
 107
 108     if file_handle is not None:
 109         return 'file{0}'.format(file_handle)
 110     else:
 111         _create_description_unknown_counter += 1
 112         return 'unknown{0}'.format(_create_description_unknown_counter)
 113
 114
 115 #: error message for IterativeReader constructor
 116 _STR_ERR = 'not accepting file name "{0}" since cannot guarantee closing ' \
 117            'files --> use with open(file_name)!'
 118
 119
 120 class IterativeReader(object):
 121     """
 122     Read continuously from a given file.
 123
 124     Use `os.stat(file_obj.fileno()).st_size` as measure whether file has
 125     changed or not; Always reads as much data as possible.
 126
 127     Does not care about closing files, so does not accept file names.
 128
 129     This is the base for class :py:class:`LineReader` that just has to
 130     implement a different :py:meth:`prepare_result` method.
 131     """
 132
 133     def __init__(self, sources, descs=None, return_when_done=False):
 134         """
 135         Create a reader; do some basic checks on args.
 136
 137         :param sources: iterable over sources. Sources can be opened file
 138                         objects or read-opened os-level file descriptors.
 139                         Calling code has to ensure they are closed properly, so
 140                         best use this within a "with open(file_name) as
 141                         file_handle:"-context. If sources is a single file
 142                         obj/descriptor, both source and desc will be converted
 143                         to lists of length 1
 144         :param descs: can be anything of same length as sources. If sources is
 145                       a single source, then descs is also converted to a list
 146                       of length 1. If not given (i.e. None), will use
 147                       :py:func:`create_description` to guess descriptions
 148         :param bool return_when_done: ignore file_handle if no-one is writing
 149                                       to it any more. Return from iterator when
 150                                       all watched files are done (not
 151                                       implemented yet)
 152         :raises: OSError when testing fstat on source
 153         """
 154         if not sources:
 155             raise ValueError('need at least some source!')
 156         elif is_str_or_byte(sources):
 157             raise ValueError(_STR_ERR.format(sources))
 158         elif is_file_obj(sources) or isinstance(sources, int):
 159             source_input = [sources, ]
 160             desc_input = [descs, ]
 161         else:
 162             source_input = sources  # assume some iterable
 163             desc_input = descs
 164
 165         # now divide sources into os-level file descriptors for os.fstat,
 166         # and file objects for read()
 167         self.file_objs = []
 168         self.file_handles = []          # file descriptOR, not descriptION
 169         for source in source_input:
 170             if is_file_obj(source):
 171                 self.file_objs.append(source)
 172                 self.file_handles.append(source.fileno())
 173             elif isinstance(source, int):
 174                 self.file_objs.append(os.fdopen(source))
 175                 self.file_handles.append(source)
 176             elif is_str_or_byte(source):
 177                 raise ValueError(_STR_ERR.format(source))
 178             else:
 179                 raise ValueError('source {0} is neither file obj nor file '
 180                                  'descriptor!')
 181
 182             # try to fstat the new file descriptor just for testing
 183             os.fstat(self.file_handles[-1])
 184
 185         # guess descriptions if not given
 186         if not desc_input:
 187             self.descriptions = [create_description(obj, file_handle)
 188                                  for obj, file_handle
 189                                  in zip(self.file_objs, self.file_handles)]
 190         else:
 191             try:
 192                 if len(desc_input) != len(self.file_objs):
 193                     raise ValueError('need same number of sources and '
 194                                      'descriptions!')
 195             except TypeError:
 196                 pass  # desc_input is generator or so
 197
 198             self.descriptions = []
 199             for obj, file_handle, description in \
 200                     zip_longest(self.file_objs, self.file_handles, desc_input):
 201                 if obj is None:
 202                     raise ValueError('more descriptions than sources!')
 203                 elif description is None:
 204                     self.descriptions.append(create_description(obj,
 205                                                                 file_handle))
 206                 else:
 207                     self.descriptions.append(description)
 208
 209         self.last_sizes = [0 for _ in self.file_objs]
 210         self.ignore = [False for _ in self.file_objs]
 211
 212         if return_when_done:
 213             self.is_used_func = check_is_used
 214         else:
 215             self.is_used_func = true_func
 216
 217         for obj, file_handle, description in \
 218                 zip(self.file_objs, self.file_handles, self.descriptions):
 219             logging.debug('log_read initialized with file descriptor {0}, '
 220                           'file obj {1}, description "{2}"'
 221                           .format(file_handle, obj, description))
 222
 223     def n_sources(self):
 224         """Return number of sources given to constructor."""
 225         return len(self.file_objs)
 226
 227     def n_active_sources(self):
 228         """Return number of sources we are actually watching."""
 229         return len(self.ignore) - sum(self.ignore)
 230
 231     def __iter__(self):
 232         """
 233         Continue reading from sources, yield results.
 234
 235         yields result of :py:meth:`prepare_result`, which depends on what sub
 236         class you called this function from.
 237         """
 238         while True:
 239             for idx, (obj, file_handle, description, last_size, do_ignore) in \
 240                     enumerate(zip(self.file_objs, self.file_handles,
 241                                   self.descriptions, self.last_sizes,
 242                                   self.ignore)):
 243                 if do_ignore:
 244                     continue
 245
 246                 # get new file size
 247                 new_size = os.fstat(file_handle).st_size
 248
 249                 # compare to old size
 250                 if new_size == last_size:
 251                     if not self.is_used_func(file_handle):
 252                         warn('no one is writing to {0} / {1} -- '
 253                              'stop watching it!'
 254                              .format(file_handle, description),
 255                              category=LogReadWarning)
 256                         self.ignore[idx] = True
 257                 else:
 258                     if new_size < last_size:  # happened at start of some tests
 259                         warn('{0} / {1} has become smaller ({2} --> {3})! '
 260                              .format(obj, description, last_size, new_size)
 261                              + 'Maybe you are reading from a half-initialized '
 262                              + 'file?',
 263                              category=LogReadWarning)
 264                     try:
 265                         new_data = obj.read()
 266                     except OSError as ose:    # includes IOErrors
 267                         warn('io error reading from {0} / {1}: {2})'
 268                              .format(obj, description, ose),
 269                              category=LogReadWarning)
 270                         new_data = str(ose)
 271                     except UnicodeDecodeError as ude:
 272                         warn('unicode error reading from {0} / {1}: {2}'
 273                              .format(obj, description, ude),
 274                              category=LogReadWarning)
 275                         new_data = str(ude)
 276
 277                     # post-processing
 278                     to_yield = self.prepare_result(description, new_data, idx)
 279                     for result in to_yield:
 280                         yield result
 281
 282                     # prepare next iteration
 283                     self.last_sizes[idx] = new_size
 284
 285     def prepare_result(self, description, data, idx):
 286         """
 287         From raw new data create some yield-able results.
 288
 289         Intended for overwriting in subclasses.
 290
 291         This function is called from __iter__ for each new data that becomes
 292         available. It has to return some iterable whose entries are yielded
 293         from iteration over objects of this class.
 294
 295         This base implementation just returns its input in a list, so new data
 296         is yielded from __iter__ as-is. Subclass implementations can also yield
 297         tuples.
 298
 299         :param str description: Description of source of lines, one of
 300                                 :py:data:`self.descriptions`
 301         :param str data: Text data read from source
 302         :param idx: Index of data source
 303         :returns: [(description, data, idx], same as input
 304         :rtype [(str, str, int)]
 305         """
 306         return [(description, data, idx), ]
 307
 308
 309 #: characters to `rstrip()` from end of complete lines
 310 LINE_SPLITTERS = '\n\r'
 311
 312
 313 class LineReader(IterativeReader):
 314     """
 315     An :py:class:`IterativeReader` that returns new data line-wise.
 316
 317     This means buffering partial line data.
 318     """
 319
 320     def __init__(self, *args, **kwargs):
 321         """Create an :py:class:`IterativeReader and buffers for sources."""
 322         super(LineReader, self).__init__(*args, **kwargs)
 323         self.line_buffers = ['' for _ in range(self.n_sources())]
 324
 325     def prepare_result(self, description, new_data, idx):
 326         """
 327         Take raw new data and split it into lines.
 328
 329         If line is not complete, then buffer it.
 330
 331         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 332         :returns: list of 3-tuples `(description, line, idx)` where
 333                   `description` and `idx` are same as args, and `line` is
 334                   without trailing newline characters
 335         :rtype: [(str, str, int)]
 336         """
 337         all_data = self.line_buffers[idx] + new_data
 338         self.line_buffers[idx] = ''
 339         result = []
 340         should_be_no_new_lines = False
 341         for line in all_data.splitlines(True):
 342             if line[-1] in LINE_SPLITTERS:
 343                 result.append((description, line.rstrip(LINE_SPLITTERS), idx))
 344             elif should_be_no_new_lines:
 345                 # self-check
 346                 raise ValueError('Programming error: something went wrong with '
 347                                  'line splitting/buffering.')
 348             else:
 349                 self.line_buffers[idx] = line
 350                 should_be_no_new_lines = True  # (this should be the last)
 351
 352         return result
 353
 354
 355 class LogParser(LineReader):
 356     """
 357     Takes lines from :py:class:`LineReader` and parses their contents.
 358
 359     Requires a pattern for log lines, auto-detection is not implemented yet.
 360
 361     Iteration returns re.match result or -- if matching failed -- the original
 362     raw line.
 363     """
 364
 365     def __init__(self, log_file, pattern=None):
 366         """
 367         Create a LogParser.
 368
 369         :param str log_file: name of log file to parse (required!)
 370         :param pattern: regexp to split log lines; None (default) to return
 371                         line as they are
 372         :type pattern: str or None (default)
 373         """
 374         super(LogParser, self).__init__(log_file)
 375
 376         self.pattern = pattern
 377
 378     def prepare_result(self, *args):
 379         """
 380         Try parsing lines.
 381
 382         Args: see super class method :py:meth:`IterativeReader.prepare_result`
 383         :returns: 3-tuples `(description, line, idx)` where `description` and
 384                   `idx` are same as input args and `line` is either a
 385                   :py:class:`re.Match` if line matched :py:data:`self.pattern`
 386                   or just str if line did not match.
 387         :rtype: [(str, :py:class:`re.Match` OR str, int)]
 388         """
 389         # let super class split data into lines
 390         result = []
 391         for description, raw_line, idx in \
 392                 super(LogParser, self).prepare_result(*args):
 393             matches = re.match(self.pattern, raw_line)
 394             if matches:
 395                 result.append((description, matches, idx))
 396             else:
 397                 result.append((description, raw_line, idx))
 398         return result